示例#1
0
def osaca_analyse_instrumented_assembly(instrumented_assembly_file,
                                        micro_architecture,
                                        assign_optimal_throughput=True):
    """
    Run OSACA analysis on an instrumented assembly.

    As a side effect, a warning is printed to stderr when more than 10% of the
    kernel's instructions could not be matched by OSACA.

    :param instrumented_assembly_file: path of assembly that was built with markers
    :param micro_architecture: micro architecture string as taken by OSACA.
                               one of: SNB, IVB, HSW, BDW, SKL
    :param assign_optimal_throughput: if True (the default, and the previous hard-coded
                                      behavior), ``assign_optimal_throughput`` of OSACA's
                                      semantics object is applied to the kernel before the
                                      port pressure is summed up; pass False to skip it
    :return: a dictionary with the following keys:
        - 'output': the report returned by the OSACA frontend's ``full_analysis``
        - 'throughput': the block throughput in cycles for one possibly vectorized loop
                        iteration; this is the maximum over all port pressures and the
                        longest loop-carried dependency
        - 'port cycles': OrderedDict, mapping port name to number of active cycles
        - 'uops': always None, the number of Uops is not given by OSACA
        - 'lcd': summed latency of the longest loop-carried dependency (0 if there is none)
    """
    result = {}
    isa = osaca.MachineModel.get_isa_for_arch(micro_architecture)
    parser = osaca.get_asm_parser(micro_architecture)
    with open(instrumented_assembly_file) as f:
        parsed_code = parser.parse_file(f.read())
    # Only the marked section of the assembly is analysed
    kernel = osaca.reduce_to_section(parsed_code, isa)
    osaca_machine_model = osaca.MachineModel(arch=micro_architecture)
    semantics = osaca.ArchSemantics(machine_model=osaca_machine_model)
    semantics.add_semantics(kernel)
    if assign_optimal_throughput:
        semantics.assign_optimal_throughput(kernel)

    kernel_graph = osaca.KernelDG(kernel, parser, osaca_machine_model)
    frontend = osaca.Frontend(instrumented_assembly_file,
                              arch=micro_architecture)

    # Throughput Analysis
    throughput_values = semantics.get_throughput_sum(kernel)

    # LCD Latency Analysis: sum the latencies along every loop-carried dependency
    # chain and keep the longest one. The leading 0 keeps the result at zero when
    # no loop-carried dependency exists.
    lcd_dict = kernel_graph.get_loopcarried_dependencies()
    lcd_latencies = [
        sum(instr_form['latency_lcd']
            for instr_form in lcd_dict[dep]['dependencies'])
        for dep in lcd_dict
    ]
    max_lcd = max([0] + lcd_latencies)

    result['output'] = frontend.full_analysis(kernel,
                                              kernel_graph,
                                              verbose=True)
    result['port cycles'] = OrderedDict(
        zip(osaca_machine_model['ports'], throughput_values))
    result['throughput'] = max(throughput_values + [max_lcd])
    result['uops'] = None  # Not given by OSACA
    result['lcd'] = max_lcd

    unmatched_ratio = osaca.get_unmatched_instruction_ratio(kernel)
    if unmatched_ratio > 0.1:
        print(
            'WARNING: {:.0%} of the instruction could not be matched during incore analysis '
            'with OSACA. Fix this by extending OSACAs instruction form database with the '
            'required instructions.'.format(unmatched_ratio),
            file=sys.stderr)

    return result
示例#2
0
def llvm_mca_analyse_instrumented_assembly(
        instrumented_assembly_file, micro_architecture, isa='x86'):
    """
    Run LLVM-MCA analysis on an instrumented assembly.

    The marked section of the assembly is piped into the ``llvm-mca`` executable and
    its textual report is parsed.

    :param instrumented_assembly_file: path of assembly that was built with markers
    :param micro_architecture: string of command line arguments selecting the micro
                               architecture; it is split on whitespace and passed to
                               ``llvm-mca`` as is
    :param isa: instruction set name handed to the assembly parser and to the
                section reduction (default: 'x86')
    :return: a dictionary with the following keys:
        - 'output': the output of the llvm-mca executable
        - 'throughput': the maximum number of cycles over all ports
        - 'port cycles': OrderedDict, mapping port name to number of active cycles;
                         if several resource units share a name, the maximum is used
        - 'uops': total number of Uops, summed over all instructions of the section
    :raises subprocess.CalledProcessError: if llvm-mca exits with a non-zero status
    :raises AttributeError: if an expected section is missing from the llvm-mca output
    """
    result = {}
    with open(instrumented_assembly_file) as f:
        parsed_code = parse_asm(f.read(), isa)
    kernel = osaca.reduce_to_section(parsed_code, isa)
    assembly_section = '\n'.join(instr.line for instr in kernel)

    # split() without argument: repeated blanks must not become empty arguments
    output = subprocess.check_output(['llvm-mca'] + micro_architecture.split(),
                                     input=assembly_section.encode('utf-8')).decode('utf-8')
    result['output'] = output

    # Extract port names: map resource index (e.g. '[6.1]') to resource name
    resources = re.search(r'Resources:\n(?:[^\n]+\n)+', output)
    port_names = OrderedDict(
        (entry.group(1), entry.group(2))
        for entry in re.finditer(r'(\[[0-9\.]+\])\s+-\s+([a-zA-Z0-9]+)', resources.group()))

    # Extract cycles per port: one line of resource indices followed by one line of values
    port_cycles = OrderedDict()
    pressure = re.search(r'Resource pressure per iteration:\n[^\n]+\n[^\n]+', output)
    index_line, value_line = pressure.group().split('\n')[1:]
    for port, cycles in zip(index_line.split(), value_line.split()):
        cycles = 0.0 if cycles == '-' else float(cycles)
        name = port_names[port]
        # Some architectures have multiple "ports" per resource in LLVM-MCA,
        # e.g., Sandybridge has a Port23 resource which is found at [6.0] and [6.1].
        # Keep the maximum of both instead of letting the last one overwrite the first.
        port_cycles[name] = max(cycles, port_cycles.get(name, cycles))

    result['port cycles'] = port_cycles
    result['throughput'] = max(port_cycles.values())

    # Extract uops from the instruction info table. The match starts at the header
    # line ('[1] [2] ... Instructions:'); after strip() that header is the first line
    # and every following line describes one instruction, uOps count first.
    uops_raw = re.search(r'\n\[1\](\s+\[[0-9\.]+\]\s+)+Instructions:\n(?:\s*[0-9\.]+\s+[^\n]+\n)+',
                         output).group()
    instruction_rows = uops_raw.strip().split('\n')[1:]
    result['uops'] = sum(int(row.split()[0]) for row in instruction_rows)

    return result
示例#3
0
def llvm_mca_analyse_instrumented_assembly(instrumented_assembly_file,
                                           micro_architecture,
                                           isa='x86'):
    """
    Run LLVM-MCA analysis on an instrumented assembly.

    The marked section of the assembly is piped into the ``llvm-mca`` executable
    (with the timeline view enabled) and its textual report is parsed.

    :param instrumented_assembly_file: path of assembly that was built with markers
    :param micro_architecture: string of command line arguments selecting the micro
                               architecture; it is split on whitespace and passed to
                               ``llvm-mca`` as is
    :param isa: instruction set name handed to the assembly parser and to the
                section reduction (default: 'x86')
    :return: a dictionary with the following keys:
        - 'output': the output of the llvm-mca executable
        - 'throughput': the maximum of all port cycles and of the loop-carried
                        dependency latency estimated from the timeline
        - 'port cycles': OrderedDict, mapping port name to number of active cycles;
                         if several resource units share a name, the maximum is used
        - 'uops': total number of Uops, summed over all instructions of the section
    :raises subprocess.CalledProcessError: if llvm-mca exits with a non-zero status
    :raises AttributeError: if an expected section is missing from the llvm-mca output
    """
    result = {}
    with open(instrumented_assembly_file) as f:
        parsed_code = parse_asm(f.read(), isa)
    kernel = osaca.reduce_to_section(parsed_code, isa)
    assembly_section = '\n'.join(instr.line for instr in kernel)

    # split() without argument: repeated blanks must not become empty arguments
    output = subprocess.check_output(
        ['llvm-mca'] + micro_architecture.split() + [
            '--timeline', '--timeline-max-cycles=1000',
            '--timeline-max-iterations=4'
        ],
        input=assembly_section.encode('utf-8')).decode('utf-8')
    result['output'] = output

    # Extract port names: map resource index (e.g. '[6.1]') to resource name
    resources = re.search(r'Resources:\n(?:[^\n]+\n)+', output)
    port_names = OrderedDict(
        (entry.group(1), entry.group(2))
        for entry in re.finditer(r'(\[[0-9\.]+\])\s+-\s+([a-zA-Z0-9]+)',
                                 resources.group()))

    # Extract cycles per port: one line of resource indices followed by one line of values
    port_cycles = OrderedDict()
    pressure = re.search(r'Resource pressure per iteration:\n[^\n]+\n[^\n]+',
                         output)
    index_line, value_line = pressure.group().split('\n')[1:]
    for port, cycles in zip(index_line.split(), value_line.split()):
        cycles = 0.0 if cycles == '-' else float(cycles)
        name = port_names[port]
        # Some architectures have multiple "ports" per resource in LLVM-MCA,
        # e.g., Sandybridge has a Port23 resource which is found at [6.0] and [6.1];
        # we will consider the maximum of both
        port_cycles[name] = max(cycles, port_cycles.get(name, cycles))
    result['port cycles'] = port_cycles

    # Extract throughput including loop-carried-dependency latency:
    # compare the column of the retire marker 'R' of the last instruction in
    # iteration 0 with the column of the same instruction in iteration 1.
    timeline_lines = [
        line for line in output.split('\n')
        if re.match(r'\[[0-9]+,[0-9]+\]', line)
    ]
    lcd = 0
    # Both stay None until a line of iteration 0 was seen, so a timeline that does
    # not start with iteration 0 no longer triggers an UnboundLocalError below.
    last_instr_index = None
    lcd_start = None
    for line in timeline_lines:
        if line.startswith('[0,'):
            last_instr_index = re.match(r'\[0,([0-9]+)\]', line).group(1)
            lcd_start = line.index('R')
        elif (last_instr_index is not None
              and line.startswith('[1,' + last_instr_index + ']')):
            lcd = line.index('R') - lcd_start
            break
    result['throughput'] = max(max(port_cycles.values()), lcd)

    # Extract uops from the instruction info table. The match starts at the header
    # line ('[1] [2] ... Instructions:'); after strip() that header is the first line
    # and every following line describes one instruction, uOps count first.
    uops_raw = re.search(
        r'\n\[1\](\s+\[[0-9\.]+\]\s+)+Instructions:\n(?:\s*[0-9\.]+\s+[^\n]+\n)+',
        output).group()
    instruction_rows = uops_raw.strip().split('\n')[1:]
    result['uops'] = sum(int(row.split()[0]) for row in instruction_rows)

    return result
示例#4
0
def build_mark_run_all_kernels(measurements=True, osaca=True, iaca=True, llvm_mca=True):
    """
    Build, mark, analyse and optionally measure all kernels.

    If get_current_arch() returns an architecture, only that one is processed
    ("local" mode): its 'prepare' commands are executed, kernels are built and
    measurements may be run. Otherwise all architectures in arch_info are
    processed, build_kernel() is called with dontbuild=True and no measurements
    are run.

    For every architecture the results are kept as a list of row dicts, one per
    (arch, kernel, compiler, cflags_name) combination. The list is loaded from
    build/{arch}/data.pkl if that file exists and is pickled back to it after
    each set of compiler flags, whenever it changed.

    :param measurements: if True and running in local mode, run scalingrun() on
                         the built executable (unless 'allruns' is already set)
    :param osaca: if True, analyse the marked assembly with OSACA
    :param iaca: if True and arch_info[arch]['IACA'] is not None, analyse with IACA
    :param llvm_mca: if True and arch_info[arch]['LLVM-MCA'] is not None, analyse
                     with LLVM-MCA
    """
    arch = get_current_arch()
    if arch is None:
        # Not running on a known architecture: iterate over all of them
        arches = arch_info.keys()
        islocal = False
    else:
        islocal = True
        arches = [arch]
        ainfo = arch_info.get(arch)
        # Run the architecture's preparation commands, if any are configured
        if 'prepare' in ainfo:
            for cmd in ainfo['prepare']:
                check_call(cmd)
    for arch in arches:
        ainfo = arch_info.get(arch)
        print(arch)
        # Load previously collected rows for this architecture, if present
        data_path = Path(f"build/{arch}/data.pkl")
        if data_path.exists():
            with data_path.open('rb') as f:
                data = pickle.load(f)
        else:
            data = []
        # Snapshot used to detect whether anything changed since the last save
        data_lastsaved = deepcopy(data)
        for compiler, compiler_cflags in ainfo['cflags'].items():
            # In local mode, compilers that are not installed are skipped
            if not shutil.which(compiler) and islocal:
                print(compiler, "not found in path! Skipping...")
                continue
            for cflags_name, cflags in compiler_cflags.items():
                for kernel in get_kernels():
                    print(f"{kernel:<15} {arch:>5} {compiler:>5} {cflags_name:>6}",
                        end=": ", flush=True)
                    # Look up an existing row for this combination
                    row = list([r for r in data
                                if r['arch'] == arch and r['kernel'] == kernel and
                                r['compiler'] == compiler and r['cflags_name'] == cflags_name])
                    if row:
                        # Use the first match; it is the same object as in data,
                        # so updates below are reflected in data
                        row = row[0]
                    else:
                        # NOTE(review): orig_row is not read anywhere in this function
                        orig_row = None
                        # No row yet: create one and register it in data
                        row = {
                            'arch': arch,
                            'kernel': kernel,
                            'compiler': compiler,
                            'cflags_name': cflags_name,
                            'element_size': 8,
                        }
                        data.append(row)

                    # Build
                    print("build", end="", flush=True)
                    asm_path, exec_path, overwrite = build_kernel(
                        kernel, arch, compiler, cflags, cflags_name, dontbuild=not islocal)

                    if overwrite:
                        # clear all measurement information
                        row['best_length'] = None
                        row['best_runtime'] = None
                        row['L2_traffic'] = None
                        row['allruns'] = None
                        row['perfevents'] = None

                    # Mark for IACA, OSACA and LLVM-MCA
                    print("mark", end="", flush=True)
                    try:
                        marked_asmfile, marked_objfile, row['pointer_increment'], overwrite = mark(
                            asm_path, compiler, cflags, isa=ainfo['isa'], overwrite=overwrite)
                        row['marking_error'] = None
                    except ValueError as e:
                        # Record the error and skip all analyses for this kernel
                        row['marking_error'] = str(e)
                        print(":", e)
                        continue

                    if overwrite:
                        # clear all model generated information
                        for model in ['IACA', 'OSACA', 'LLVM-MCA', 'Ithemal']:
                            for k in ['ports', 'prediction', 'throughput', 'cp', 'lcd', 'raw']:
                                row[model+'_'+k] = None
                    
                    # Make sure every model key exists, so rows have a uniform set of keys
                    for model in ['IACA', 'OSACA', 'LLVM-MCA', 'Ithemal']:
                        for k in ['ports', 'prediction', 'throughput', 'cp', 'lcd', 'raw']:
                            if model+'_'+k not in row:
                                row[model+'_'+k] = None

                    # In the analysis sections below, ". " is printed after a fresh
                    # analysis and "! " when an existing result is reused. All raw
                    # cycle counts are divided by pointer_increment/element_size
                    # (presumably the number of elements processed per assembly loop
                    # iteration -- TODO confirm).

                    # Analyze with IACA, if requested and configured
                    if iaca and ainfo['IACA'] is not None:
                        print("IACA", end="", flush=True)
                        if not row.get('IACA_ports'):
                            row['IACA_raw'] = iaca_analyse_instrumented_binary(
                                marked_objfile, micro_architecture=ainfo['IACA'])
                            row['IACA_ports'] = \
                                {k: v/(row['pointer_increment']/row['element_size'])
                                for k,v in row['IACA_raw']['port cycles'].items()}
                            row['IACA_prediction'] = row['IACA_raw']['throughput']/(
                                row['pointer_increment']/row['element_size'])
                            row['IACA_throughput'] = max(row['IACA_ports'].values())
                            print(". ", end="", flush=True)
                        else:
                            print("! ", end="", flush=True)

                    # Analyze with OSACA, if requested
                    if osaca:
                        print("OSACA", end="", flush=True)
                        if not row.get('OSACA_ports'):
                            # The result is expected to provide the keys 'port cycles',
                            # 'throughput', 'cp_latency' and 'lcd' (all read below)
                            row['OSACA_raw'] = osaca_analyse_instrumented_assembly(
                                marked_asmfile, micro_architecture=ainfo['OSACA'],
                                assign_optimal_throughput=ainfo.get('assign_optimal_throughput',
                                                                    True))
                            row['OSACA_ports'] = \
                                {k: v/(row['pointer_increment']/row['element_size'])
                                for k,v in row['OSACA_raw']['port cycles'].items()}
                            row['OSACA_prediction'] = row['OSACA_raw']['throughput']/(
                                row['pointer_increment']/row['element_size'])
                            row['OSACA_throughput'] = max(row['OSACA_ports'].values())
                            row['OSACA_cp'] = row['OSACA_raw']['cp_latency']/(
                                row['pointer_increment']/row['element_size'])
                            row['OSACA_lcd'] = row['OSACA_raw']['lcd']/(
                                row['pointer_increment']/row['element_size'])
                            print(". ", end="", flush=True)
                        else:
                            print("! ", end="", flush=True)

                    # Analyze with LLVM-MCA, if requested and configured
                    if llvm_mca and ainfo['LLVM-MCA'] is not None:
                        print("LLVM-MCA", end="", flush=True)
                        if not row.get('LLVM-MCA_ports'):
                            # The result is expected to provide the keys 'port cycles',
                            # 'throughput', 'cp_latency' and 'lcd' (all read below)
                            row['LLVM-MCA_raw'] = llvm_mca_analyse_instrumented_assembly(
                                marked_asmfile,
                                micro_architecture=ainfo['LLVM-MCA'],
                                isa=ainfo['isa'])
                            row['LLVM-MCA_ports'] = \
                                {k: v/(row['pointer_increment']/row['element_size'])
                                for k,v in row['LLVM-MCA_raw']['port cycles'].items()}
                            row['LLVM-MCA_prediction'] =row['LLVM-MCA_raw']['throughput']/(
                                row['pointer_increment']/row['element_size'])
                            row['LLVM-MCA_throughput'] = max(row['LLVM-MCA_ports'].values())
                            row['LLVM-MCA_cp'] = row['LLVM-MCA_raw']['cp_latency']/(
                                row['pointer_increment']/row['element_size'])
                            row['LLVM-MCA_lcd'] = row['LLVM-MCA_raw']['lcd']/(
                                row['pointer_increment']/row['element_size'])
                            print(". ", end="", flush=True)
                        else:
                            print("! ", end="", flush=True)
                    
                    # Analyze with Ithemal, if not running local and configured
                    if ainfo['Ithemal'] is not None and not islocal:
                        print("Ithemal", end="", flush=True)
                        if not row.get('Ithemal_prediction'):
                            with open(marked_asmfile) as f:
                                parsed_code = parse_asm(f.read(), ainfo['isa'])
                            # NOTE(review): this rebinds the loop variable `kernel` and
                            # the reduced section is not used by the prediction call
                            # below, which works on marked_objfile -- confirm intent
                            kernel = reduce_to_section(parsed_code, ainfo['isa'])
                            row['Ithemal_prediction'] = get_ithemal_prediction(
                                get_intel_style_code(marked_objfile), model=ainfo['Ithemal'])
                            print(". ", end="", flush=True)
                        else:
                            print("! ", end="", flush=True)

                    if measurements and islocal:
                        # run measurements if on same hardware
                        print("scale", end="", flush=True)
                        if not row.get('allruns'):
                            # find best length with concurrent L2 measurement
                            scaling_runs, best = scalingrun(exec_path)
                            row['best_length'] = best[0]
                            row['best_runtime'] = best[2]
                            row['L2_traffic'] = best[3]
                            row['allruns'] = scaling_runs
                            print(f"({best[0]}). ", end="", flush=True)
                        else:
                            print(f"({row.get('best_length', None)})! ", end="", flush=True)

                    # Terminate the status line of this kernel
                    print()

                # dump to file (once per set of compiler flags, only if data changed)
                if data != data_lastsaved:
                    print('saving... ', end="", flush=True)
                    with data_path.open('wb') as f:
                        try:
                            pickle.dump(data, f)
                            data_lastsaved = deepcopy(data)
                            print('saved!')
                        except KeyboardInterrupt:
                            # Interrupted while saving: rewind, write the data again
                            # from the start of the file and then exit (presumably to
                            # avoid leaving a partially written pickle behind)
                            f.seek(0)
                            pickle.dump(data, f)
                            print('saved!')
                            sys.exit()