Пример #1
0
def cook_asm(out, code, save):
    """Write the ``mul6_broadwell`` macro and its wrapper text to *out*.

    *code* is rewritten via ``E.replace_wi`` / ``E.replace_ymm`` and a few
    regex substitutions, then handed to ``P.write_cpp_code`` together with a
    descriptor dict.  Afterwards every line of ``g_wr_code`` is appended,
    each one passed through ``P.append_backslash``.

    out  -- object with a ``write`` method that receives the generated text
    code -- text to rewrite
    save -- mapping whose values are forwarded to ``E.append_save_registers``
    """
    ymm_map = {'jV': 15, 'sV': 14, 'zV': 13}
    # NOTE(review): presumably extends ymm_map in place -- confirm in E.
    E.append_save_registers(ymm_map, 12, save.values())

    reg_by_index = {7: 'rcx', 10: 'rbx', 9: 'rbp', 8: 'r12', 6: 'r13', 5: 'r14'}
    gpr_scratch = {0: 'rax', 1: 'r8', 2: 'r9', 3: 'r10', 4: 'r11'}

    # Clobber entries: the ymm registers first, then the general-purpose ones.
    clobbered = ['%%ymm%s' % number for number in ymm_map.values()]
    clobbered.extend('%' + name for name in gpr_scratch.values())

    reg_by_index.update(gpr_scratch)
    asm = E.replace_wi(code, reg_by_index)
    asm = E.replace_ymm(asm, 15, ymm_map)  # only one 32-bit register
    asm = re.sub(r'\bdd\b', '%%rdx', asm)
    # Pointer names become named operand references: up -> %[up], rp -> %[rp].
    for pointer in ('up', 'rp'):
        asm = re.sub(r'\b%s\b' % pointer, '%%[%s]' % pointer, asm)

    clobber = 'cc memory %rdx ' + ' '.join(clobbered)
    source = os.path.basename(sys.argv[0])
    data = {
        'input': ['up S u_p', 'rp D r_p'],
        'input_output': ['vp +c v_p'],
        'clobber': clobber,
        'macro_name': 'mul6_broadwell',
        'macro_parameters': 'r_p u_p v_p',
        'source': source,
        'code_language': 'asm',
    }
    P.write_cpp_code(out, asm, data)

    # Append the wrapper text line by line.
    out.write('\n')
    wrapper_lines = g_wr_code.strip().split('\n')
    for line in wrapper_lines:
        out.write(P.append_backslash(line, 88))
    out.write('    }\n')
Пример #2
0
def cook_macro(o, code, xmm_save, var_map):
    """Write a ``<subroutine>_macro`` macro plus its wrapper text to *o*.

    o        -- object with a ``write`` method that receives the output
    code     -- iterable of source lines
    xmm_save -- accepted for interface compatibility; not used here
    var_map  -- passed to ``P.replace_symbolic_names_wr`` and
                ``extract_register_name``
    """
    # Discard save and restore instructions, and comments: any line that
    # contains '!' or '#' is dropped.
    kept = [line for line in code if '!' not in line and '#' not in line]

    asm = P.replace_symbolic_names_wr('\n'.join(kept), var_map)
    # Double every percent sign.
    asm = asm.replace('%', '%%')

    # Registers taken from var_map, minus the three excluded here.
    scratch = extract_register_name(var_map) - {'rdx', 'rdi', 'rsi'}

    data = {
        'input': ['rp D r_p'],
        'input_output': ['up +S u_p', 'vp +d v_p'],
        'macro_name': P.guess_subroutine_name(sys.argv[1]) + '_macro',
        'macro_parameters': 'r_p u_p v_p',
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        'clobber': 'cc memory ' + ' '.join('%' + reg for reg in scratch),
    }

    P.write_cpp_code(o, asm, data)
    o.write('\n')

    # Fill the '@w' and '@n' placeholders of the wrapper template, then
    # emit it line by line.
    name = data['macro_name']
    wrapper = g_wr_code.strip()
    wrapper = wrapper.replace('@w', name + '_wr')
    wrapper = wrapper.replace('@n', name)
    for line in wrapper.split('\n'):
        o.write(P.append_backslash(line, 84))
    o.write('    }\n')
Пример #3
0
def do_it(tgt):
    """Write the ``mpn_mul2_add_4k`` macro and its wrapper text to *tgt*.

    tgt -- object with a ``write`` method that receives the output
    """
    indices = range(4)
    data = {
        'macro_name': 'mpn_mul2_add_4k',
        'scratch': ['w%s s%s' % (k, k) for k in indices],
        'vars_type': {'s%s' % k: 0 for k in indices},
        'default_type': 'mp_limb_t',
        'input_output': ['rp +r r_p', 'up +r u_p', 'vp +r v_p', 'lc +c k'],
        'clobber': 'memory cc',
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        'macro_parameters': 'r_p u_p v_p k',
    }

    names = P.extract_int_vars_name(data['scratch'])
    names = names + P.extract_int_vars_name(data['input_output'])

    asm = g_code.strip()
    # Each variable name becomes a named operand reference: name -> %[name].
    for name in names:
        asm = re.sub(r'\b%s\b' % name, '%%[%s]' % name, asm)
    # Labels get a '%=' suffix (presumably GCC's per-instance unique number).
    for label in ('loop', 'nearly_done'):
        asm = re.sub(r'\b%s\b' % label, label + '%=', asm)

    P.write_cpp_code(tgt, asm, data)

    # append wrapper code
    tgt.write('\n')
    for line in g_wr_code.strip().split('\n'):
        tgt.write(P.append_backslash(line, 77))
    tgt.write('    }\n')
Пример #4
0
def do_it(tgt):
    """Write the ``mpn_sub_1x`` macro to *tgt*.

    tgt -- output object forwarded to ``P.write_cpp_code``
    """
    # w3 is pinned to a fixed register instead of being a named operand.
    asm = re.sub(r'\bw3\b', '%%rax', g_code.strip())

    # s0..s2 use the default type (value 0 here); s3 is explicitly 16-bit.
    var_types = {'s%s' % k: 0 for k in range(3)}
    var_types['s3'] = 'uint16_t'

    scratch = ['w%s s%s' % (k, k) for k in range(3)]
    scratch.append('kk s3')

    data = {
        'macro_name': 'mpn_sub_1x',
        'scratch': scratch,
        'vars_type': var_types,
        'default_type': 'mp_limb_t',
        'input_output': ['nn +a n', 'rp +r r_p', 'ap +r a_p', 'bp +r b_p'],
        'clobber': 'memory cc',
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        'macro_parameters': 'r_p a_p b_p n',
    }

    names = P.extract_int_vars_name(data['scratch'])
    names = names + P.extract_int_vars_name(data['input_output'])

    # Labels (done_short, done_long, loop_short, loop_long) get a '%='
    # suffix before variable names are rewritten.
    labels = [
        prefix + '_' + suffix
        for prefix in ('done', 'loop')
        for suffix in ('short', 'long')
    ]
    for label in labels:
        asm = re.sub(r'\b%s\b' % label, label + '%=', asm)

    # Each variable name becomes a named operand reference: name -> %[name].
    for name in names:
        asm = re.sub(r'\b%s\b' % name, '%%[%s]' % name, asm)

    P.write_cpp_code(tgt, asm, data)
def do_it(tgt):
    """Write the ``mpn_bdiv_dbm1c_4k_inplace`` macro, wrapper and function.

    Output order: the macro (via ``P.write_cpp_code``), a blank line, the
    lines of ``g_wr_code`` passed through ``P.append_backslash``, a closing
    brace line, and finally ``g_func_code`` verbatim.

    tgt -- object with a ``write`` method that receives the output
    """
    indices = range(4)
    data = {
        'macro_name': 'mpn_bdiv_dbm1c_4k_inplace',
        'scratch': ['w%s s%s' % (k, k) for k in indices],
        'vars_type': {'s%s' % k: 0 for k in indices},
        'default_type': 'mp_limb_t',
        'input_output': ['rp +r r_p', 'mm +d m', 'nn +r n'],
        'clobber': 'memory cc',
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        'macro_parameters': 'r_p n m',
    }

    names = P.extract_int_vars_name(data['scratch'])
    names = names + P.extract_int_vars_name(data['input_output'])

    asm = g_code.strip()
    # Each variable name becomes a named operand reference: name -> %[name].
    for name in names:
        asm = re.sub(r'\b%s\b' % name, '%%[%s]' % name, asm)
    # Labels get a '%=' suffix; the text is padded with one space for the
    # substitution and right-stripped again afterwards.
    for label in ('loop', 'enter_here'):
        padded = asm + ' '
        asm = re.sub(r'\b%s\b' % label, label + '%=', padded).rstrip()

    P.write_cpp_code(tgt, asm, data)

    tgt.write('\n')
    for line in g_wr_code.strip().split('\n'):
        tgt.write(P.append_backslash(line, 88))
    tgt.write('    }\n')
    tgt.write(g_func_code)
def do_it_6_or_7(tgt, data, tail):
    """Write a macro whose body is ``g_code_0`` followed by *tail*.

    tgt  -- output object forwarded to ``P.write_cpp_code``
    data -- descriptor dict; must hold 'scratch' and 'input'.  It is
            modified in place: 'vars_type' is (re)assigned.
    tail -- text appended to ``g_code_0`` before stripping
    """
    asm = (g_code_0 + tail).strip()

    names = P.extract_int_vars_name(data['scratch'])
    names = names + P.extract_int_vars_name(data['input'])
    # Each variable name becomes a named operand reference: name -> %[name].
    for name in names:
        asm = re.sub(r'\b%s\b' % name, '%%[%s]' % name, asm)

    # Every external scratch name maps to an empty type string.
    data['vars_type'] = {
        ext: '' for ext in P.extract_ext_vars_name(data['scratch'])
    }

    P.write_cpp_code(tgt, asm, data)
Пример #7
0
def do_it(tgt):
    """Write the ``toom22_add_sub_8`` and ``toom22_add_add_8`` macros.

    The second macro reuses the first one's rewritten code with ``subq`` /
    ``sbbq`` replaced by ``addq`` / ``adcq``, and its descriptor has no
    'source' entry.

    tgt -- object with a ``write`` method that receives the output
    """
    assert g_n == 8
    placeholders = 'w0 w1 w2 w3 w4 w5 w8'.split(' ')
    registers = ['%' + reg for reg in 'rax rcx rdx r8 r9 r10 r11'.split(' ')]
    data = {
        'macro_name': 'toom22_add_sub_%s' % g_n,
        'input': ['rp D r_p', 'cp S c_p'],
        'scratch': ['bJ x b_j', 'bS x b_s'],
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        'vars_type': {'b_j': '', 'b_s': ''},
        'default_type': '__m256i',
        'macro_parameters': 'r_p c_p',
        'clobber': "cc memory " + ' '.join(registers),
    }
    asm = g_code_8

    # Placeholders map pairwise onto the registers; entries of `registers`
    # already carry one '%', so the emitted text has a doubled percent sign.
    for placeholder, reg in zip(placeholders, registers):
        asm = re.sub(r'\b%s\b' % placeholder, '%' + reg, asm)
    # w6 and w7 go to rbx/rbp, which are not part of the clobber list above.
    asm = re.sub(r'\bw6\b', '%%rbx', asm)
    asm = re.sub(r'\bw7\b', '%%rbp', asm)

    # Pointer names become named operand references: rp -> %[rp], cp -> %[cp].
    for pointer in ('rp', 'cp'):
        asm = re.sub(r'\b%s\b' % pointer, '%%[%s]' % pointer, asm)

    # bJ/bS become %[bJ]/%[bS]; the *_128 spellings become %x[bJ]/%x[bS].
    for letter in ('J', 'S'):
        asm = re.sub(r'\bb%s\b' % letter, '%%[b%s]' % letter, asm)
        asm = re.sub(r'\bb%s_128\b' % letter, '%%x[b%s]' % letter, asm)

    # Labels get a '%=' suffix.
    for label in ('again', 'done'):
        asm = re.sub(r'\b%s\b' % label, label + '%=', asm)

    P.write_cpp_code(tgt, asm, data)

    # TODO: is it faster to use both adcx and adox instead of adcq?

    tgt.write('\n')

    # Turn the subtracting variant into the adding one.
    for mnemonic, replacement in (('subq', 'addq'), ('sbbq', 'adcq')):
        asm = re.sub(r'\b%s\b' % mnemonic, replacement, asm)

    data['macro_name'] = 'toom22_add_add_%s' % g_n
    del data['source']

    P.write_cpp_code(tgt, asm, data)
def do_it(tgt):
    """Write the ``shr1_7_avx2`` macro to *tgt*.

    tgt -- output object forwarded to ``P.write_cpp_code``
    """
    # NOTE(review): the clobber list names ymm0..ymm3 while w0..w3 below are
    # mapped to ymm12..ymm15 -- confirm against g_code which set is intended.
    ymm_clobbers = ' '.join('ymm%s' % k for k in range(4))
    data = {
        'macro_name': 'shr1_7_avx2',
        'input': ['rp r tgt', 'sp r src'],
        'clobber': 'memory ' + ymm_clobbers,
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        'macro_parameters': 'tgt src',
    }

    names = P.extract_int_vars_name(data['input'])
    # A trailing space is added for the substitutions and removed at the end.
    asm = g_code.strip() + ' '
    # Each variable name becomes a named operand reference: name -> %[name].
    for name in names:
        asm = re.sub(r'\b%s\b' % name, '%%[%s]' % name, asm)
    # w0..w3 are replaced by ymm12..ymm15 (with a doubled percent sign).
    for k in range(4):
        asm = re.sub(r'\bw%s\b' % k, '%%ymm' + str(k + 12), asm)

    P.write_cpp_code(tgt, asm.rstrip(), data)
Пример #9
0
def do_it(tgt, code, b_ofs):
    """Write a macro named after the subroutine guessed from ``sys.argv[1]``.

    tgt   -- output object forwarded to ``P.write_cpp_code``
    code  -- source passed to ``P.cutoff_comments``
    b_ofs -- second argument given to ``chew_line`` for every line
    """
    indices = range(4)
    data = {
        'macro_name': P.guess_subroutine_name(sys.argv[1]),
        'scratch': ['w%s s%s' % (k, k) for k in indices],
        'vars_type': {'s%s' % k: 0 for k in indices},
        'default_type': 'mp_limb_t',
        'input': ['rp r r_p', 'ap r a_p'],
        'clobber': 'memory cc',
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        'macro_parameters': 'r_p a_p',
    }

    names = P.extract_int_vars_name(data['scratch']) + ['ap', 'rp']

    # Run every comment-free line through chew_line and join the results.
    chewed = [chew_line(line, b_ofs) for line in P.cutoff_comments(code)]
    asm = '\n'.join(chewed)
    # Each variable name becomes a named operand reference: name -> %[name].
    for name in names:
        asm = re.sub(r'\b%s\b' % name, '%%[%s]' % name, asm)

    P.write_cpp_code(tgt, asm, data)
def do_it(tgt):
    """Write the ``shr1_6k_plus1_avx2`` macro to *tgt*.

    tgt -- output object forwarded to ``P.write_cpp_code``
    """
    # NOTE(review): the clobber list names ymm0..ymm3 while w0..w3 below are
    # mapped to ymm12..ymm15 -- confirm against g_code which set is intended.
    ymm_clobbers = ' '.join('ymm%s' % k for k in range(4))
    data = {
        'macro_name': 'shr1_6k_plus1_avx2',
        'input_output': ['rp +r adr', 'kk +r k'],
        'clobber': 'memory cc ' + ymm_clobbers,
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        'macro_parameters': 'adr k',
    }

    names = P.extract_int_vars_name(data['input_output'])
    # A trailing space is added for the substitutions and removed at the end.
    asm = g_code.strip() + ' '
    # Each variable name becomes a named operand reference: name -> %[name].
    for name in names:
        asm = re.sub(r'\b%s\b' % name, '%%[%s]' % name, asm)
    # w0..w3 are replaced by ymm12..ymm15 (with a doubled percent sign).
    for k in range(4):
        asm = re.sub(r'\bw%s\b' % k, '%%ymm' + str(k + 12), asm)
    # Labels get a '%=' suffix.
    for label in ('loop', 'nearly_done'):
        asm = re.sub(r'\b%s\b' % label, label + '%=', asm)

    P.write_cpp_code(tgt, asm.rstrip(), data)
def do_it(tgt):
    """Write the ``mpn_sub_4k_inplace`` macro to *tgt*.

    tgt -- output object forwarded to ``P.write_cpp_code``
    """
    indices = range(4)
    data = {
        'macro_name': 'mpn_sub_4k_inplace',
        'scratch': ['w%s s%s' % (k, k) for k in indices],
        'vars_type': {'s%s' % k: 0 for k in indices},
        'default_type': 'mp_limb_t',
        'input_output': [
            'ca +r carry',
            'cp +r c_p',
            'ap +r a_p',
            'lc +r loop_count',
        ],
        'clobber': 'memory cc',
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        'macro_parameters': 'carry c_p a_p loop_count',
    }

    names = P.extract_int_vars_name(data['scratch'])
    names = names + P.extract_int_vars_name(data['input_output'])

    asm = g_code.strip()
    # Each variable name becomes a named operand reference: name -> %[name].
    for name in names:
        asm = re.sub(r'\b%s\b' % name, '%%[%s]' % name, asm)
    # The single label gets a '%=' suffix.
    asm = re.sub(r'\bloop\b', 'loop%=', asm)

    P.write_cpp_code(tgt, asm, data)
Пример #12
0
def do_it(tgt):
    """Write the ``mpn_sub_n_small`` macro to *tgt*.

    tgt -- output object forwarded to ``P.write_cpp_code``
    """
    data = {
        'macro_name': 'mpn_sub_n_small',
        'scratch': ['w0 s0'],
        'vars_type': {'s0': 0},
        'default_type': 'mp_limb_t',
        'input_output': ['rp +r r_p', 'ap +r a_p', 'bp +r b_p', 'lc +r n'],
        'clobber': 'memory cc',
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        'macro_parameters': 'r_p a_p b_p n',
    }

    names = P.extract_int_vars_name(data['scratch'])
    names = names + P.extract_int_vars_name(data['input_output'])

    asm = g_code.strip()
    # Each variable name becomes a named operand reference: name -> %[name].
    for name in names:
        asm = re.sub(r'\b%s\b' % name, '%%[%s]' % name, asm)
    # Labels get a '%=' suffix.
    for label in ('loop', 'done'):
        asm = re.sub(r'\b%s\b' % label, label + '%=', asm)

    P.write_cpp_code(tgt, asm, data)
Пример #13
0
def do_it(tgt):
    """Write the ``mpn_less_3arg_hole`` macro to *tgt*.

    tgt -- output object forwarded to ``P.write_cpp_code``
    """
    indices = range(3)
    scratch = ['rr result']
    scratch.extend('w%s s%s' % (k, k) for k in indices)
    data = {
        'macro_name': 'mpn_less_3arg_hole',
        'scratch': scratch,
        'vars_type': {'s%s' % k: 0 for k in indices},
        'default_type': 'mp_limb_t',
        'input': ['up v_head', 'vp v_tail'],
        'clobber': 'cc',
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        'macro_parameters': 'result v_head v_tail',
    }

    names = P.extract_int_vars_name(data['scratch'])
    names = names + P.extract_int_vars_name(data['input'])

    asm = g_code.strip()
    # Each variable name becomes a named operand reference: name -> %[name].
    for name in names:
        asm = re.sub(r'\b%s\b' % name, '%%[%s]' % name, asm)
    # Labels get a '%=' suffix.
    for label in ('done', 'loop'):
        asm = re.sub(r'\b%s\b' % label, label + '%=', asm)

    P.write_cpp_code(tgt, asm, data)
Пример #14
0
def do_it(tgt):
    """Write the ``mpn_le_<g_n>`` macro to *tgt*.

    The instruction sequence is generated here: limbs are compared from
    byte offset ``8 * (g_n - 1)`` down to offset 0, jumping to ``done`` at
    the first non-zero difference.

    tgt -- output object forwarded to ``P.write_cpp_code``
    """
    data = {
        'macro_name': 'mpn_le_%s' % g_n,
        'input': ['ap a_p', 'bp b_p'],
        'scratch': ['re tgt', 'w0 scratch'],
        'clobber': 'cc',
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        'vars_type': {'scratch': 'uint64_t'},
        'macro_parameters': 'tgt a_p b_p',
    }

    # Highest limb first; the result register is zeroed between load and
    # subtract.
    top = 8 * (g_n - 1)
    lines = [
        'movq %s(ap), w0' % top,
        'xorq re, re',
        'subq %s(bp), w0' % top,
        'jnz done',
    ]
    # carry flag is set iff subtraction result is negative

    # Limbs g_n - 2 down to 1.
    for limb in range(g_n - 2, 0, -1):
        disp = 8 * limb
        lines.extend((
            'movq %s(ap), w0' % disp,
            'subq %s(bp), w0' % disp,
            'jnz done',
        ))

    # Lowest limb falls through into the label; the carry is then added
    # into the result.
    lines.extend(('movq (ap), w0', 'subq (bp), w0', 'done:', 'adcq $0, re'))

    asm = '\n'.join(lines)

    names = P.extract_int_vars_name(data['scratch'])
    names = names + P.extract_int_vars_name(data['input'])
    # Each variable name becomes a named operand reference: name -> %[name].
    for name in names:
        asm = re.sub(r'\b%s\b' % name, '%%[%s]' % name, asm)
    # The label gets a '%=' suffix.
    asm = re.sub(r'\bdone\b', 'done%=', asm)

    P.write_cpp_code(tgt, asm, data)
def do_it(tgt):
    """Write the ``mpn_sub_inplace`` macro to *tgt*.

    tgt -- output object forwarded to ``P.write_cpp_code``
    """
    indices = range(4)
    data = {
        'macro_name': 'mpn_sub_inplace',
        'scratch': ['w%s s%s' % (k, k) for k in indices],
        'vars_type': {'s%s' % k: 0 for k in indices},
        'default_type': 'mp_limb_t',
        'input_output': ['nn +r n', 'cp +r tgt', 'ap +r src'],
        'clobber': 'memory cc',
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        'macro_parameters': 'tgt src n',
    }

    names = P.extract_int_vars_name(data['scratch'])
    names = names + P.extract_int_vars_name(data['input_output'])

    asm = g_code.strip()
    # Each variable name becomes a named operand reference: name -> %[name].
    for name in names:
        asm = re.sub(r'\b%s\b' % name, '%%[%s]' % name, asm)

    # Labels (done_short, done_long, loop_short, loop_long) get a '%='
    # suffix.
    labels = [
        prefix + '_' + suffix
        for prefix in ('done', 'loop')
        for suffix in ('short', 'long')
    ]
    for label in labels:
        asm = re.sub(r'\b%s\b' % label, label + '%=', asm)

    P.write_cpp_code(tgt, asm, data)
def do_it(tgt):
    """Write the ``mpn_bdiv_dbm1c_4k`` macro, wrapper and function.

    Output order: the macro (via ``P.write_cpp_code``), a blank line, the
    lines of ``g_wr_code`` passed through ``P.append_backslash``, a closing
    brace line, and finally ``g_func_code`` verbatim.

    tgt -- object with a ``write`` method that receives the output
    """
    data = {
        'macro_name': 'mpn_bdiv_dbm1c_4k',
        'input_output': ['rp +r r_p', 'up +r u_p', 'mm +d m_m', 'nn +r n'],
        'clobber': 'memory cc %rax %rcx %r8',
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        'macro_parameters': 'r_p u_p n m_m',
    }

    asm = g_code.strip()
    # aa/dd/hh are pinned to the three clobbered registers.
    fixed_registers = (
        (r'\baa\b', '%%rax'),
        (r'\bdd\b', '%%rcx'),
        (r'\bhh\b', '%%r8'),
    )
    for pattern, register in fixed_registers:
        asm = re.sub(pattern, register, asm)

    # Labels get a '%=' suffix; the text is padded with one space for the
    # substitution and right-stripped again afterwards.
    for label in ('loop', 'tail', 'enter_here'):
        padded = asm + ' '
        asm = re.sub(r'\b%s\b' % label, label + '%=', padded).rstrip()

    # Operand names become named operand references: name -> %[name].
    for name in ('rp', 'up', 'mm', 'nn'):
        asm = re.sub(r'\b%s\b' % name, '%%[%s]' % name, asm)

    P.write_cpp_code(tgt, asm, data)

    tgt.write('\n')
    for line in g_wr_code.strip().split('\n'):
        tgt.write(P.append_backslash(line, 88))
    tgt.write('    }\n')
    tgt.write(g_func_code)
Пример #17
0
def do_it(tgt):
    """Write the ``mpn_sub<g_n>`` macro to *tgt*.

    For ``g_n`` equal to 6 or 7 the work is delegated to ``do_it_6_or_7``
    with the matching tail.  Otherwise ``g_n`` must be a multiple of 4 and
    the code is assembled from ``g_code_0``, ``g_n // 4 - 1`` copies of
    ``g_code_m`` and one ``g_code_e``; each appended piece goes through
    ``update_ofs`` with an offset that advances by 32 per middle piece.

    tgt -- output object forwarded to ``P.write_cpp_code``
    """
    data = {
        'macro_name': 'mpn_sub%s' % g_n,
        'input': ['rp r_p', 'up u_p', 'vp v_p'],
        'scratch': ['w%s w_%s' % (i, i) for i in range(4)],
        'clobber': 'cc memory',
        'default_type': 'uint64_t',
        'macro_parameters': 'r_p u_p v_p',
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
    }
    if g_n == 6:
        do_it_6_or_7(tgt, data, g_tail_6)
        return
    if g_n == 7:
        do_it_6_or_7(tgt, data, g_tail_7)
        return
    assert g_n % 4 == 0
    # Floor division keeps the count an int on Python 3 as well; with '/'
    # it would be a float there and range() would raise TypeError.
    loop_count = g_n // 4 - 1
    code = g_code_0.strip()
    ofs = 0
    for _ in range(loop_count):
        code += '\n' + update_ofs(g_code_m, ofs)
        ofs += 32
    code += '\n' + update_ofs(g_code_e, ofs)
    # Drop an explicit zero displacement: ' 0(' -> ' ('.
    code = code.replace(' 0(', ' (')
    all_vars = P.extract_int_vars_name(data['scratch']) + \
            P.extract_int_vars_name(data['input'])
    # Each variable name becomes a named operand reference: name -> %[name].
    for v in all_vars:
        code = re.sub(r'\b%s\b' % v, '%%[%s]' % v, code)

    # Every external scratch name maps to an empty type string.
    vars_type = dict((v, '') for v in P.extract_ext_vars_name(data['scratch']))
    data['vars_type'] = vars_type

    P.write_cpp_code(tgt, code, data)