def cook_asm(out, code, save):
    """Write the ``mul6_broadwell`` asm macro and its wrapper text to *out*.

    out:  object with a ``write`` method that receives the generated text
    code: assembly text using symbolic register/operand names
    save: mapping; its values are forwarded to ``E.append_save_registers``
    """
    vector_regs = {'jV': 15, 'sV': 14, 'zV': 13}
    # NOTE(review): presumably this adds entries to vector_regs in place;
    # it is called before the clobber list is derived from the mapping.
    E.append_save_registers(vector_regs, 12, save.values())
    clobbered = ['%%ymm%s' % number for number in vector_regs.values()]

    general_regs = {7: 'rcx', 10: 'rbx', 9: 'rbp', 8: 'r12', 6: 'r13', 5: 'r14'}
    caller_scratch = {0: 'rax', 1: 'r8', 2: 'r9', 3: 'r10', 4: 'r11'}
    # Only the caller_scratch registers join the clobber list; the other
    # general registers are used for renaming only.
    clobbered.extend('%' + reg for reg in caller_scratch.values())
    general_regs.update(caller_scratch)

    code = E.replace_wi(code, general_regs)
    code = E.replace_ymm(code, 15, vector_regs)
    # only one 32-bit register
    code = re.sub(r'\bdd\b', '%%rdx', code)
    # The pointer operands become named asm operands: up -> %[up], rp -> %[rp].
    for operand in ('up', 'rp'):
        code = re.sub(r'\b%s\b' % operand, '%%[%s]' % operand, code)

    data = {
        'input': ['up S u_p', 'rp D r_p'],
        'input_output': ['vp +c v_p'],
        'clobber': 'cc memory %rdx ' + ' '.join(clobbered),
        'macro_name': 'mul6_broadwell',
        'macro_parameters': 'r_p u_p v_p',
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
    }
    P.write_cpp_code(out, code, data)

    # Append the wrapper text, one backslash-continued line at a time.
    out.write('\n')
    for wrapper_line in g_wr_code.strip().split('\n'):
        out.write(P.append_backslash(wrapper_line, 88))
    out.write(' }\n')
def cook_macro(o, code, xmm_save, var_map):
    """Write an asm macro built from *code*, plus its wrapper text, to *o*.

    o:        object with a ``write`` method that receives the generated text
    code:     iterable of assembly source lines
    xmm_save: unused here; kept so existing callers keep working
    var_map:  symbolic-name mapping passed to ``P.replace_symbolic_names_wr``
              and ``extract_register_name``

    The macro name is ``P.guess_subroutine_name(sys.argv[1]) + '_macro'``.
    """
    # discard save and restore instructions, and comments
    # (i.e. every line containing '!' or '#')
    kept = [line for line in code if '!' not in line and '#' not in line]
    code = '\n'.join(kept)
    code = P.replace_symbolic_names_wr(code, var_map)
    # Double every '%' in the asm text.
    code = code.replace('%', '%%')

    # rdx, rdi and rsi are removed from the scratch set; the operand
    # constraints below use the letters d, D and S.
    scratch = extract_register_name(var_map) - set(['rdx', 'rdi', 'rsi'])
    data = {
        'input': ['rp D r_p'],
        'input_output': ['up +S u_p', 'vp +d v_p'],
        'macro_name': P.guess_subroutine_name(sys.argv[1]) + '_macro',
        'macro_parameters': 'r_p u_p v_p',
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        # Sorted so the generated text is identical from run to run; plain
        # set iteration order can vary between interpreter invocations.
        'clobber': 'cc memory ' + ' '.join('%' + s for s in sorted(scratch)),
    }
    P.write_cpp_code(o, code, data)

    o.write('\n')
    name = data['macro_name']
    # '@w' and '@n' are placeholders in the wrapper template.
    wrapper = g_wr_code.strip().replace('@w', name + '_wr').replace('@n', name)
    for wrapper_line in wrapper.split('\n'):
        o.write(P.append_backslash(wrapper_line, 84))
    o.write(' }\n')
def do_it(tgt):
    """Write the ``mpn_mul2_add_4k`` asm macro and its wrapper text to *tgt*.

    tgt: object with a ``write`` method that receives the generated text
    """
    limbs = range(4)
    data = {
        'macro_name': 'mpn_mul2_add_4k',
        'scratch': ['w%s s%s' % (k, k) for k in limbs],
        'vars_type': dict.fromkeys(['s%s' % k for k in limbs], 0),
        'default_type': 'mp_limb_t',
        'input_output': ['rp +r r_p', 'up +r u_p', 'vp +r v_p', 'lc +c k'],
        'clobber': 'memory cc',
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        'macro_parameters': 'r_p u_p v_p k',
    }

    operands = P.extract_int_vars_name(data['scratch'])
    operands = operands + P.extract_int_vars_name(data['input_output'])

    code = g_code.strip()
    # Each operand name becomes a named asm operand, e.g. rp -> %[rp].
    for operand in operands:
        code = re.sub(r'\b%s\b' % operand, '%%[%s]' % operand, code)
    # Labels get a '%=' suffix (presumably so that every expansion of the
    # macro ends up with distinct label names).
    for label in ('loop', 'nearly_done'):
        code = re.sub(r'\b%s\b' % label, label + '%=', code)
    P.write_cpp_code(tgt, code, data)

    # append wrapper code
    tgt.write('\n')
    for wrapper_line in g_wr_code.strip().split('\n'):
        tgt.write(P.append_backslash(wrapper_line, 77))
    tgt.write(' }\n')
def do_it(tgt):
    """Write the ``mpn_sub_1x`` asm macro to *tgt*.

    tgt: output handle passed through to ``P.write_cpp_code``
    """
    # w3 is pinned to rax before any other substitution takes place.
    code = re.sub(r'\bw3\b', '%%rax', g_code.strip())

    # s0..s2 use the default type (value 0); s3 is explicitly a uint16_t.
    vars_type = {'s0': 0, 's1': 0, 's2': 0}
    vars_type['s3'] = 'uint16_t'

    scratch = ['w%s s%s' % (k, k) for k in range(3)]
    scratch.append('kk s3')
    data = {
        'macro_name': 'mpn_sub_1x',
        'scratch': scratch,
        'vars_type': vars_type,
        'default_type': 'mp_limb_t',
        'input_output': ['nn +a n', 'rp +r r_p', 'ap +r a_p', 'bp +r b_p'],
        'clobber': 'memory cc',
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        'macro_parameters': 'r_p a_p b_p n',
    }
    operands = P.extract_int_vars_name(data['scratch'])
    operands = operands + P.extract_int_vars_name(data['input_output'])

    # Labels first: done_short, done_long, loop_short, loop_long.
    labels = [head + '_' + kind
              for head in ('done', 'loop')
              for kind in ('short', 'long')]
    for label in labels:
        code = re.sub(r'\b%s\b' % label, label + '%=', code)
    # Then the operand names, turned into named asm operands (%[name]).
    for operand in operands:
        code = re.sub(r'\b%s\b' % operand, '%%[%s]' % operand, code)

    P.write_cpp_code(tgt, code, data)
def do_it(tgt):
    """Write the ``mpn_bdiv_dbm1c_4k_inplace`` macro, wrapper and function code.

    tgt: object with a ``write`` method that receives the generated text
    """
    limbs = range(4)
    data = {
        'macro_name': 'mpn_bdiv_dbm1c_4k_inplace',
        'scratch': ['w%s s%s' % (k, k) for k in limbs],
        'vars_type': dict.fromkeys(['s%s' % k for k in limbs], 0),
        'default_type': 'mp_limb_t',
        'input_output': ['rp +r r_p', 'mm +d m', 'nn +r n'],
        'clobber': 'memory cc',
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        'macro_parameters': 'r_p n m',
    }
    operands = P.extract_int_vars_name(data['scratch'])
    operands = operands + P.extract_int_vars_name(data['input_output'])

    code = g_code.strip()
    # Operand names become named asm operands, e.g. rp -> %[rp].
    for operand in operands:
        code = re.sub(r'\b%s\b' % operand, '%%[%s]' % operand, code)
    # Labels get a '%=' suffix. The text is padded with one blank before the
    # substitution and trimmed again afterwards, as the original code did.
    for label in ('loop', 'enter_here'):
        padded = code + ' '
        code = re.sub(r'\b%s\b' % label, label + '%=', padded).rstrip()
    P.write_cpp_code(tgt, code, data)

    # Wrapper text, one backslash-continued line at a time.
    tgt.write('\n')
    for wrapper_line in g_wr_code.strip().split('\n'):
        tgt.write(P.append_backslash(wrapper_line, 88))
    tgt.write(' }\n')
    tgt.write(g_func_code)
def do_it_6_or_7(tgt, data, tail):
    """Write the macro built from ``g_code_0`` followed by *tail*.

    tgt:  output handle passed through to ``P.write_cpp_code``
    data: macro description; its 'scratch' and 'input' entries are read and
          a 'vars_type' entry is stored into it (the dict is modified)
    tail: assembly text appended to ``g_code_0``
    """
    operands = P.extract_int_vars_name(data['scratch'])
    operands = operands + P.extract_int_vars_name(data['input'])

    code = (g_code_0 + tail).strip()
    # Operand names become named asm operands, e.g. rp -> %[rp].
    for operand in operands:
        code = re.sub(r'\b%s\b' % operand, '%%[%s]' % operand, code)

    # Every external scratch variable is mapped to an empty type string.
    data['vars_type'] = dict.fromkeys(
        P.extract_ext_vars_name(data['scratch']), '')
    P.write_cpp_code(tgt, code, data)
def do_it(tgt):
    """Write the ``toom22_add_sub_8`` and ``toom22_add_add_8`` macros to *tgt*.

    The second macro reuses the first one's code with subq/sbbq replaced by
    addq/adcq, and is written without the 'source' entry.

    tgt: object with a ``write`` method that receives the generated text
    """
    assert g_n == 8
    symbolic = 'w0 w1 w2 w3 w4 w5 w8'.split(' ')
    # Already carries one '%'; a second one is prepended when substituting.
    physical = ['%' + reg for reg in 'rax rcx rdx r8 r9 r10 r11'.split(' ')]

    data = {
        'macro_name': 'toom22_add_sub_%s' % g_n,
        'input': ['rp D r_p', 'cp S c_p'],
        'scratch': ['bJ x b_j', 'bS x b_s'],
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        'vars_type': {
            'b_j': '',
            'b_s': ''
        },
        'default_type': '__m256i',
        'macro_parameters': 'r_p c_p',
        'clobber': "cc memory " + ' '.join(physical),
    }

    code = g_code_8
    for name, reg in zip(symbolic, physical):
        code = re.sub(r'\b%s\b' % name, '%' + reg, code)
    # w6 and w7 map to rbx and rbp, which are not part of the clobber list.
    for name, reg in (('w6', '%%rbx'), ('w7', '%%rbp')):
        code = re.sub(r'\b%s\b' % name, reg, code)
    for pointer in ('rp', 'cp'):
        code = re.sub(r'\b%s\b' % pointer, '%%[%s]' % pointer, code)
    # bJ/bS become %[bJ]/%[bS]; the *_128 spellings become %x[bJ]/%x[bS].
    for letter in ('J', 'S'):
        code = re.sub(r'\bb%s\b' % letter, '%%[b%s]' % letter, code)
        code = re.sub(r'\bb%s_128\b' % letter, '%%x[b%s]' % letter, code)
    for label in ('again', 'done'):
        code = re.sub(r'\b%s\b' % label, label + '%=', code)
    P.write_cpp_code(tgt, code, data)

    # TODO: is it faster to use both adcx and adox instead of adcq?
    tgt.write('\n')
    for subtract, add in {'subq': 'addq', 'sbbq': 'adcq'}.items():
        code = re.sub(r'\b%s\b' % subtract, add, code)
    data['macro_name'] = 'toom22_add_add_%s' % g_n
    del data['source']
    P.write_cpp_code(tgt, code, data)
def do_it(tgt):
    """Write the ``shr1_7_avx2`` asm macro to *tgt*.

    tgt: output handle passed through to ``P.write_cpp_code``
    """
    # NOTE(review): the clobber list names ymm0..ymm3 while w0..w3 are
    # substituted with ymm12..ymm15 below -- confirm this is intended.
    clobbered = ['ymm%s' % k for k in range(4)]
    data = {
        'macro_name': 'shr1_7_avx2',
        'input': ['rp r tgt', 'sp r src'],
        'clobber': 'memory ' + ' '.join(clobbered),
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        'macro_parameters': 'tgt src',
    }

    # A trailing blank is kept during substitution and removed at the end.
    code = g_code.strip() + ' '
    for operand in P.extract_int_vars_name(data['input']):
        code = re.sub(r'\b%s\b' % operand, '%%[%s]' % operand, code)
    for k in range(4):
        register = '%%ymm' + str(k + 12)
        code = re.sub(r'\bw%s\b' % k, register, code)

    P.write_cpp_code(tgt, code.rstrip(), data)
def do_it(tgt, code, b_ofs):
    """Write an asm macro named after the subroutine guessed from argv[1].

    tgt:   output handle passed through to ``P.write_cpp_code``
    code:  assembly source handed to ``P.cutoff_comments``
    b_ofs: second argument forwarded to ``chew_line`` for every line
    """
    limbs = range(4)
    data = {
        'macro_name': P.guess_subroutine_name(sys.argv[1]),
        'scratch': ['w%s s%s' % (k, k) for k in limbs],
        'vars_type': dict.fromkeys(['s%s' % k for k in limbs], 0),
        'default_type': 'mp_limb_t',
        'input': ['rp r r_p', 'ap r a_p'],
        'clobber': 'memory cc',
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        'macro_parameters': 'r_p a_p',
    }
    # The two pointer operands are listed by hand, after the scratch names.
    operands = P.extract_int_vars_name(data['scratch']) + ['ap', 'rp']

    chewed = [chew_line(line, b_ofs) for line in P.cutoff_comments(code)]
    code = '\n'.join(chewed)
    # Operand names become named asm operands, e.g. rp -> %[rp].
    for operand in operands:
        code = re.sub(r'\b%s\b' % operand, '%%[%s]' % operand, code)

    P.write_cpp_code(tgt, code, data)
def do_it(tgt):
    """Write the ``shr1_6k_plus1_avx2`` asm macro to *tgt*.

    tgt: output handle passed through to ``P.write_cpp_code``
    """
    # NOTE(review): the clobber list names ymm0..ymm3 while w0..w3 are
    # substituted with ymm12..ymm15 below -- confirm this is intended.
    clobbered = ['ymm%s' % k for k in range(4)]
    data = {
        'macro_name': 'shr1_6k_plus1_avx2',
        'input_output': ['rp +r adr', 'kk +r k'],
        'clobber': 'memory cc ' + ' '.join(clobbered),
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        'macro_parameters': 'adr k',
    }

    # A trailing blank is kept during substitution and removed at the end.
    code = g_code.strip() + ' '
    for operand in P.extract_int_vars_name(data['input_output']):
        code = re.sub(r'\b%s\b' % operand, '%%[%s]' % operand, code)
    for k in range(4):
        register = '%%ymm' + str(k + 12)
        code = re.sub(r'\bw%s\b' % k, register, code)
    # Labels get a '%=' suffix.
    for label in ('loop', 'nearly_done'):
        code = re.sub(r'\b%s\b' % label, label + '%=', code)

    P.write_cpp_code(tgt, code.rstrip(), data)
def do_it(tgt):
    """Write the ``mpn_sub_4k_inplace`` asm macro to *tgt*.

    tgt: output handle passed through to ``P.write_cpp_code``
    """
    limbs = range(4)
    data = {
        'macro_name': 'mpn_sub_4k_inplace',
        'scratch': ['w%s s%s' % (k, k) for k in limbs],
        'vars_type': dict.fromkeys(['s%s' % k for k in limbs], 0),
        'default_type': 'mp_limb_t',
        'input_output': ['ca +r carry', 'cp +r c_p', 'ap +r a_p',
                         'lc +r loop_count'],
        'clobber': 'memory cc',
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        'macro_parameters': 'carry c_p a_p loop_count',
    }
    operands = P.extract_int_vars_name(data['scratch'])
    operands = operands + P.extract_int_vars_name(data['input_output'])

    code = g_code.strip()
    # Operand names become named asm operands, e.g. cp -> %[cp].
    for operand in operands:
        code = re.sub(r'\b%s\b' % operand, '%%[%s]' % operand, code)
    # The single label gets a '%=' suffix.
    code = re.sub(r'\bloop\b', 'loop%=', code)

    P.write_cpp_code(tgt, code, data)
def do_it(tgt):
    """Write the ``mpn_sub_n_small`` asm macro to *tgt*.

    tgt: output handle passed through to ``P.write_cpp_code``
    """
    data = {
        'macro_name': 'mpn_sub_n_small',
        'scratch': ['w0 s0'],
        'vars_type': {'s0': 0},
        'default_type': 'mp_limb_t',
        'input_output': ['rp +r r_p', 'ap +r a_p', 'bp +r b_p', 'lc +r n'],
        'clobber': 'memory cc',
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        'macro_parameters': 'r_p a_p b_p n',
    }
    operands = P.extract_int_vars_name(data['scratch'])
    operands = operands + P.extract_int_vars_name(data['input_output'])

    code = g_code.strip()
    # Operand names become named asm operands, e.g. rp -> %[rp].
    for operand in operands:
        code = re.sub(r'\b%s\b' % operand, '%%[%s]' % operand, code)
    # Labels get a '%=' suffix.
    for label in ('loop', 'done'):
        code = re.sub(r'\b%s\b' % label, label + '%=', code)

    P.write_cpp_code(tgt, code, data)
def do_it(tgt):
    """Write the ``mpn_less_3arg_hole`` asm macro to *tgt*.

    tgt: output handle passed through to ``P.write_cpp_code``
    """
    limbs = range(3)
    # 'rr result' comes first, followed by the three w/s scratch pairs.
    scratch = ['rr result']
    scratch.extend('w%s s%s' % (k, k) for k in limbs)
    data = {
        'macro_name': 'mpn_less_3arg_hole',
        'scratch': scratch,
        'vars_type': dict.fromkeys(['s%s' % k for k in limbs], 0),
        'default_type': 'mp_limb_t',
        'input': ['up v_head', 'vp v_tail'],
        'clobber': 'cc',
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        'macro_parameters': 'result v_head v_tail',
    }
    operands = P.extract_int_vars_name(data['scratch'])
    operands = operands + P.extract_int_vars_name(data['input'])

    code = g_code.strip()
    # Operand names become named asm operands, e.g. up -> %[up].
    for operand in operands:
        code = re.sub(r'\b%s\b' % operand, '%%[%s]' % operand, code)
    # Labels get a '%=' suffix.
    for label in ('done', 'loop'):
        code = re.sub(r'\b%s\b' % label, label + '%=', code)

    P.write_cpp_code(tgt, code, data)
def do_it(tgt):
    """Write the ``mpn_le_<g_n>`` asm macro to *tgt*.

    The assembly is generated here: the limbs at byte offsets 8*(g_n-1) down
    to 0 of ``ap`` and ``bp`` are subtracted one pair at a time, jumping to
    ``done`` at the first non-zero difference, and the final ``adcq`` adds
    the carry flag into ``re``.

    tgt: output handle passed through to ``P.write_cpp_code``
    """
    data = {
        'macro_name': 'mpn_le_%s' % g_n,
        'input': ['ap a_p', 'bp b_p'],
        'scratch': ['re tgt', 'w0 scratch'],
        'clobber': 'cc',
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        'vars_type': {
            'scratch': 'uint64_t'
        },
        'macro_parameters': 'tgt a_p b_p',
    }

    # The limb at the highest offset: this step also zeroes the result.
    top = 8 * (g_n - 1)
    lines = [
        'movq %s(ap), w0' % top,
        'xorq re, re',
        'subq %s(bp), w0' % top,
        'jnz done',
    ]
    # carry flag is set iff subtraction result is negative
    for limb in range(g_n - 2, 0, -1):
        offset = 8 * limb
        lines.extend([
            'movq %s(ap), w0' % offset,
            'subq %s(bp), w0' % offset,
            'jnz done',
        ])
    # Offset 0 needs no jump: execution falls through to the label.
    lines.extend(['movq (ap), w0', 'subq (bp), w0', 'done:', 'adcq $0, re'])
    code = '\n'.join(lines)

    operands = P.extract_int_vars_name(data['scratch'])
    operands = operands + P.extract_int_vars_name(data['input'])
    for operand in operands:
        code = re.sub(r'\b%s\b' % operand, '%%[%s]' % operand, code)
    code = re.sub(r'\bdone\b', 'done%=', code)

    P.write_cpp_code(tgt, code, data)
def do_it(tgt):
    """Write the ``mpn_sub_inplace`` asm macro to *tgt*.

    tgt: output handle passed through to ``P.write_cpp_code``
    """
    limbs = range(4)
    data = {
        'macro_name': 'mpn_sub_inplace',
        'scratch': ['w%s s%s' % (k, k) for k in limbs],
        'vars_type': dict.fromkeys(['s%s' % k for k in limbs], 0),
        'default_type': 'mp_limb_t',
        'input_output': ['nn +r n', 'cp +r tgt', 'ap +r src'],
        'clobber': 'memory cc',
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        'macro_parameters': 'tgt src n',
    }
    operands = P.extract_int_vars_name(data['scratch'])
    operands = operands + P.extract_int_vars_name(data['input_output'])

    code = g_code.strip()
    # Operand names become named asm operands, e.g. cp -> %[cp].
    for operand in operands:
        code = re.sub(r'\b%s\b' % operand, '%%[%s]' % operand, code)

    # Labels, in order: done_short, done_long, loop_short, loop_long.
    labels = [head + '_' + kind
              for head in ('done', 'loop')
              for kind in ('short', 'long')]
    for label in labels:
        code = re.sub(r'\b%s\b' % label, label + '%=', code)

    P.write_cpp_code(tgt, code, data)
def do_it(tgt):
    """Write the ``mpn_bdiv_dbm1c_4k`` macro, wrapper and function code.

    tgt: object with a ``write`` method that receives the generated text
    """
    data = {
        'macro_name': 'mpn_bdiv_dbm1c_4k',
        'input_output': ['rp +r r_p', 'up +r u_p', 'mm +d m_m', 'nn +r n'],
        'clobber': 'memory cc %rax %rcx %r8',
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
        'macro_parameters': 'r_p u_p n m_m',
    }

    code = g_code.strip()
    # Fixed registers: these three are also the ones named in the clobber list.
    for name, register in (('aa', '%%rax'), ('dd', '%%rcx'), ('hh', '%%r8')):
        code = re.sub(r'\b%s\b' % name, register, code)
    # Labels get a '%=' suffix. The text is padded with one blank before the
    # substitution and trimmed again afterwards, as the original code did.
    for label in ('loop', 'tail', 'enter_here'):
        padded = code + ' '
        code = re.sub(r'\b%s\b' % label, label + '%=', padded).rstrip()
    # Operand names become named asm operands, e.g. rp -> %[rp].
    for operand in ('rp', 'up', 'mm', 'nn'):
        code = re.sub(r'\b%s\b' % operand, '%%[%s]' % operand, code)
    P.write_cpp_code(tgt, code, data)

    # Wrapper text, one backslash-continued line at a time.
    tgt.write('\n')
    for wrapper_line in g_wr_code.strip().split('\n'):
        tgt.write(P.append_backslash(wrapper_line, 88))
    tgt.write(' }\n')
    tgt.write(g_func_code)
def do_it(tgt):
    """Write the ``mpn_sub<g_n>`` asm macro to *tgt*.

    For g_n of 6 or 7 the work is delegated to ``do_it_6_or_7`` with the
    matching tail. Otherwise g_n must be a multiple of 4 and the code is
    assembled from ``g_code_0``, ``g_n // 4 - 1`` copies of ``g_code_m`` and
    one copy of ``g_code_e``, each shifted by 32 more than the previous one
    through ``update_ofs``.

    tgt: output handle passed through to ``P.write_cpp_code``
    """
    data = {
        'macro_name': 'mpn_sub%s' % g_n,
        'input': ['rp r_p', 'up u_p', 'vp v_p'],
        'scratch': ['w%s w_%s' % (i, i) for i in range(4)],
        'clobber': 'cc memory',
        'default_type': 'uint64_t',
        'macro_parameters': 'r_p u_p v_p',
        'source': os.path.basename(sys.argv[0]),
        'code_language': 'asm',
    }

    if g_n == 6:
        do_it_6_or_7(tgt, data, g_tail_6)
        return
    if g_n == 7:
        do_it_6_or_7(tgt, data, g_tail_7)
        return

    assert g_n % 4 == 0
    # Floor division keeps the count an int; with true division range()
    # would be handed a float and raise TypeError.
    loop_count = g_n // 4 - 1

    code = g_code_0.strip()
    ofs = 0
    for _ in range(loop_count):
        code += '\n' + update_ofs(g_code_m, ofs)
        ofs += 32
    code += '\n' + update_ofs(g_code_e, ofs)
    # Drop an explicit zero displacement: ' 0(' becomes ' ('.
    code = code.replace(' 0(', ' (')

    all_vars = P.extract_int_vars_name(data['scratch']) + \
        P.extract_int_vars_name(data['input'])
    # Operand names become named asm operands, e.g. rp -> %[rp].
    for v in all_vars:
        code = re.sub(r'\b%s\b' % v, '%%[%s]' % v, code)

    # Every external scratch variable is mapped to an empty type string.
    data['vars_type'] = dict(
        (v, '') for v in P.extract_ext_vars_name(data['scratch']))
    P.write_cpp_code(tgt, code, data)