Example #1
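        # Compute reg_t8 = reg_t0 + (8 - row_start) * t_stride: the address where
        # row 8 of the tile lands when loading starts at row reg_row_start. The
        # dword MOV/SUB zero the upper half of the register, so the 64-bit IMUL
        # sees the plain difference; when row_start >= 8 the CMOVAE below falls
        # back to reg_t8 = reg_t0.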
        reg_t8_offset = GeneralPurposeRegister64()
        MOV(reg_t8_offset.as_dword, 8)
        SUB(reg_t8_offset.as_dword, reg_row_start)
        IMUL(reg_t8_offset, reg_t_stride)
        reg_t8 = GeneralPurposeRegister64()
        LEA(reg_t8, [reg_t0 + reg_t8_offset * 1])
        CMP(reg_row_start, 8)
        CMOVAE(reg_t8, reg_t0)

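        # Pointers to column 8 of each half: one YMM register (32 bytes, i.e.
        # 8 floats) to the right of reg_t0 and reg_t8.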
        reg_t0_column_8, reg_t8_column_8 = GeneralPurposeRegister64(), GeneralPurposeRegister64()
        LEA(reg_t0_column_8, [reg_t0 + YMMRegister.size])
        LEA(reg_t8_column_8, [reg_t8 + YMMRegister.size])

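        # Output storage for the two column FFTs: the left half spills entirely
        # to stack locals, while the right half keeps its first four entries in
        # YMM registers (they are consumed first, by the (0, 2) row batch below).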
        vfft_columns_0_to_8 = [LocalVariable(YMMRegister.size) for _ in range(16)]
        vfft_columns_8_to_16 = [YMMRegister() if i < 4 else LocalVariable(YMMRegister.size) for i in range(16)]

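        # Forward 16-point real FFT down the columns (hence "vfft") of the left
        # half (columns 0-7), honoring the row bounds and the column load mask.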
        fft16x16.forward_vfft(reg_t0, reg_t8, reg_t_stride, data_out=vfft_columns_0_to_8,
            reg_row_start=reg_row_start, reg_row_end=reg_row_end, ymm_load_mask=ymm_load_mask_columns_0_to_8)

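        # Materialize the load mask for columns 8-15 (the constant
        # load_mask_columns_8_to_16 is defined earlier in the full source).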
        ymm_load_mask_columns_8_to_16 = YMMRegister()
        VMOVDQA(ymm_load_mask_columns_8_to_16, load_mask_columns_8_to_16)

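        # Same column FFT for the right half (columns 8-15).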
        fft16x16.forward_vfft(reg_t0_column_8, reg_t8_column_8, reg_t_stride, data_out=vfft_columns_8_to_16,
            reg_row_start=reg_row_start, reg_row_end=reg_row_end, ymm_load_mask=ymm_load_mask_columns_8_to_16)

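        # Row pass: walk the 8 complex output rows in batches of 2 + 3 + 3 to
        # bound register pressure. Each row's real (wr) and imaginary (wi) parts
        # span two YMM registers: columns 0-7 and columns 8-15.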
        for row_batch_start, row_batch_end in [(0, 2), (2, 5), (5, 8)]:
            ymm_wr_list = [(YMMRegister(), YMMRegister()) for _ in range(row_batch_start, row_batch_end)]
            ymm_wi_list = [(YMMRegister(), YMMRegister()) for _ in range(row_batch_start, row_batch_end)]
            for row_offset, (ymm_wr, ymm_wi) in enumerate(zip(ymm_wr_list, ymm_wi_list)):
                row = row_batch_start + row_offset

                VMOVAPS(ymm_wr[0], vfft_columns_0_to_8[row*2+0])
                VMOVAPS(ymm_wr[1], vfft_columns_8_to_16[row*2+0])
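                # Matching loads of the imaginary halves (index row*2+1) and the
                # row-wise FFT butterflies follow in the full source; the example
                # is truncated here.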
Example #2
        reg_t0_column_8, reg_t8_column_8 = GeneralPurposeRegister64(), GeneralPurposeRegister64()
        LEA(reg_t0_column_8, [reg_t0 + YMMRegister.size])
        LEA(reg_t8_column_8, [reg_t8 + YMMRegister.size])

        vfft_columns_0_to_8 = [
            LocalVariable(YMMRegister.size) for _ in range(16)
        ]
        vfft_columns_8_to_16 = [
            YMMRegister() if i < 4 else LocalVariable(YMMRegister.size)
            for i in range(16)
        ]

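        # Column FFT over the left half (columns 0-7); results spill to the
        # stack locals allocated above.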
        fft16x16.forward_vfft(reg_t0,
                              reg_t8,
                              reg_t_stride,
                              data_out=vfft_columns_0_to_8,
                              reg_row_start=reg_row_start,
                              reg_row_end=reg_row_end,
                              ymm_load_mask=ymm_load_mask_columns_0_to_8)

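        # Load the mask for the right half, then transform columns 8-15.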
        ymm_load_mask_columns_8_to_16 = YMMRegister()
        VMOVDQA(ymm_load_mask_columns_8_to_16, load_mask_columns_8_to_16)

        fft16x16.forward_vfft(reg_t0_column_8,
                              reg_t8_column_8,
                              reg_t_stride,
                              data_out=vfft_columns_8_to_16,
                              reg_row_start=reg_row_start,
                              reg_row_end=reg_row_end,
                              ymm_load_mask=ymm_load_mask_columns_8_to_16)
Example #3
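    # Tail of a kernel body: store each of the 16 transformed rows to the
    # output pointer reg_f, one YMM register per row.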
    for i, ymm_i in enumerate(ymm_data):
        VMOVUPS([reg_f + i * YMMRegister.size], ymm_i)

    RETURN()

import fft16x16

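# Judging by its name, nnp_fft16_8real__fma3 computes the forward 16-point
# real FFT over a tile of 8 float columns, using FMA3 instructions.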
with Function("nnp_fft16_8real__fma3", (arg_t, arg_f),
              target=uarch.default + isa.fma3):

    reg_t0 = GeneralPurposeRegister64()
    LOAD.ARGUMENT(reg_t0, arg_t)

    reg_f = GeneralPurposeRegister64()
    LOAD.ARGUMENT(reg_f, arg_f)

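    # The input tile is packed densely: each of the 16 rows is one YMM register
    # (32 bytes = 8 floats) wide, so the row stride is YMMRegister.size.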
    reg_stride = GeneralPurposeRegister64()
    MOV(reg_stride, YMMRegister.size)

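    # Pointer to row 8, halfway down the 16-row tile.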
    reg_t8 = GeneralPurposeRegister64()
    LEA(reg_t8, [reg_t0 + 8 * YMMRegister.size])

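    # Whole-tile transform: data_out entries are yword (256-bit) memory
    # operands at reg_f, so results are written straight to memory. No row
    # bounds or load mask are passed, since the full 16x8 tile is present.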
    fft16x16.forward_vfft(
        reg_t0,
        reg_t8,
        reg_stride,
        data_out=[yword[reg_f + YMMRegister.size * i] for i in range(16)])

    RETURN()