def osaca_analyse_instrumented_assembly(
        instrumented_assembly_file, micro_architecture, assign_optimal_throughput=True):
    """
    Run OSACA analysis on an instrumented assembly.

    :param instrumented_assembly_file: path of assembly that was built with markers
    :param micro_architecture: micro architecture string as taken by OSACA.
                               one of: SNB, IVB, HSW, BDW, SKL
    :param assign_optimal_throughput: if True (default), call OSACA's
                                      ``assign_optimal_throughput`` on the kernel before
                                      the port pressure is summed up

    :return: a dictionary with the following keys:
        - 'output': the full analysis report produced by the OSACA frontend
        - 'throughput': the block throughput in cycles for one possibly vectorized loop
                        iteration (maximum over all port pressures and the longest
                        loop-carried dependency)
        - 'port cycles': OrderedDict, mapping port name to number of active cycles
        - 'lcd': latency of the longest loop-carried dependency (0 if there is none)
        - 'cp_latency': summed latency of the instructions on the critical path
        - 'uops': always None, OSACA does not report it
    """
    result = {}
    isa = osaca.MachineModel.get_isa_for_arch(micro_architecture)
    parser = osaca.get_asm_parser(micro_architecture)
    with open(instrumented_assembly_file) as f:
        parsed_code = parser.parse_file(f.read())
    # Only the instructions of the marked section are analysed
    kernel = osaca.reduce_to_section(parsed_code, isa)

    osaca_machine_model = osaca.MachineModel(arch=micro_architecture)
    semantics = osaca.ArchSemantics(machine_model=osaca_machine_model)
    semantics.add_semantics(kernel)
    if assign_optimal_throughput:
        semantics.assign_optimal_throughput(kernel)

    kernel_graph = osaca.KernelDG(kernel, parser, osaca_machine_model)
    frontend = osaca.Frontend(instrumented_assembly_file, arch=micro_architecture)

    # Throughput Analysis
    throughput_values = semantics.get_throughput_sum(kernel)

    # LCD Latency Analysis: the longest loop-carried dependency chain counts
    lcd_dict = kernel_graph.get_loopcarried_dependencies()
    lcd_latencies = [
        sum(instr_form['latency_lcd'] for instr_form in lcd_dict[dep]['dependencies'])
        for dep in lcd_dict]
    max_lcd = max([0] + lcd_latencies)
    result['lcd'] = max_lcd

    # Critical Path Latency Analysis
    critical_path = kernel_graph.get_critical_path()
    result['cp_latency'] = sum(instr_form['latency_cp'] for instr_form in critical_path)

    result['output'] = frontend.full_analysis(kernel, kernel_graph, verbose=True)
    result['port cycles'] = OrderedDict(
        list(zip(osaca_machine_model['ports'], throughput_values)))
    result['throughput'] = max(throughput_values + [max_lcd])
    result['uops'] = None  # Not given by OSACA

    unmatched_ratio = osaca.get_unmatched_instruction_ratio(kernel)
    if unmatched_ratio > 0.1:
        print(
            'WARNING: {:.0%} of the instruction could not be matched during incore analysis '
            'with OSACA. Fix this by extending OSACAs instruction form database with the '
            'required instructions.'.format(unmatched_ratio),
            file=sys.stderr)

    return result
def llvm_mca_analyse_instrumented_assembly(
        instrumented_assembly_file, micro_architecture, isa='x86'):
    """
    Run LLVM-MCA analysis on an instrumented assembly.

    :param instrumented_assembly_file: path of assembly that was built with markers
    :param micro_architecture: space separated command line arguments that are appended to
                               the ``llvm-mca`` call
    :param isa: instruction set architecture, passed on to the assembly parser and to the
                extraction of the marked section

    :return: a dictionary with the following keys:
        - 'output': the complete standard output of llvm-mca
        - 'throughput': the block throughput in cycles for one possibly vectorized loop
                        iteration (largest per-resource pressure)
        - 'port cycles': OrderedDict, mapping resource name to cycles per iteration
        - 'uops': total number of Uops
    """
    result = {}
    with open(instrumented_assembly_file) as f:
        parsed_code = parse_asm(f.read(), isa)
    # Only the instructions of the marked section are handed to llvm-mca
    kernel = osaca.reduce_to_section(parsed_code, isa)
    assembly_section = '\n'.join([line.line for line in kernel])
    output = subprocess.check_output(
        ['llvm-mca'] + micro_architecture.split(' '),
        input=assembly_section.encode('utf-8')).decode('utf-8')
    result['output'] = output

    # Extract port names: maps resource index (e.g. '[6.0]') to resource name
    port_names = OrderedDict()
    resources = re.search(r'Resources:\n(?:[^\n]+\n)+', output)
    for entry in re.finditer(r'(\[[0-9\.]+\])\s+-\s+([a-zA-Z0-9]+)', resources.group()):
        port_names[entry.group(1)] = entry.group(2)

    # Extract cycles per port
    port_cycles = OrderedDict()
    pressure = re.search(r'Resource pressure per iteration:\n[^\n]+\n[^\n]+', output)
    # First line after the caption holds the resource indices, the second one the cycles
    index_line, cycles_line = pressure.group().split('\n')[1:]
    for port, cycles in zip(index_line.split(), cycles_line.split()):
        cycles = 0.0 if cycles == '-' else float(cycles)
        name = port_names[port]
        # Several resource indices may share one name (e.g., [6.0] and [6.1]); keep the
        # maximum instead of letting the last entry overwrite the earlier ones
        port_cycles[name] = max(cycles, port_cycles.get(name, cycles))
    result['port cycles'] = port_cycles
    result['throughput'] = max(port_cycles.values())

    # Extract uops
    uops = 0
    uops_raw = re.search(
        r'\n\[1\](\s+\[[0-9\.]+\]\s+)+Instructions:\n(:?\s*[0-9\.]+\s+[^\n]+\n)+',
        output).group()
    # NOTE(review): after strip() element 0 is the table header, so [2:] also drops the
    # first table row - confirm that skipping this row is intended
    for line in uops_raw.strip().split('\n')[2:]:
        uops += int(line.strip().split(' ')[0])
    result['uops'] = uops

    return result
def llvm_mca_analyse_instrumented_assembly(instrumented_assembly_file, micro_architecture,
                                           isa='x86'):
    """
    Run LLVM-MCA analysis on an instrumented assembly.

    :param instrumented_assembly_file: path of assembly that was built with markers
    :param micro_architecture: space separated command line arguments that are appended to
                               the ``llvm-mca`` call
    :param isa: instruction set architecture, passed on to the assembly parser and to the
                extraction of the marked section

    :return: a dictionary with the following keys:
        - 'output': the complete standard output of llvm-mca (including the timeline)
        - 'throughput': the block throughput in cycles for one possibly vectorized loop
                        iteration (maximum of the largest per-resource pressure and 'lcd')
        - 'port cycles': OrderedDict, mapping resource name to cycles per iteration
        - 'lcd': loop-carried dependency latency estimated from the timeline (0 if it
                 could not be determined)
        - 'uops': total number of Uops
    """
    result = {}
    with open(instrumented_assembly_file) as f:
        parsed_code = parse_asm(f.read(), isa)
    # Only the instructions of the marked section are handed to llvm-mca
    kernel = osaca.reduce_to_section(parsed_code, isa)
    assembly_section = '\n'.join([line.line for line in kernel])
    output = subprocess.check_output(
        ['llvm-mca'] + micro_architecture.split(' ') + [
            '--timeline', '--timeline-max-cycles=1000', '--timeline-max-iterations=4'
        ],
        input=assembly_section.encode('utf-8')).decode('utf-8')
    result['output'] = output

    # Extract port names: maps resource index (e.g. '[6.0]') to resource name
    port_names = OrderedDict()
    resources = re.search(r'Resources:\n(?:[^\n]+\n)+', output)
    for entry in re.finditer(r'(\[[0-9\.]+\])\s+-\s+([a-zA-Z0-9]+)', resources.group()):
        port_names[entry.group(1)] = entry.group(2)

    # Extract cycles per port
    port_cycles = OrderedDict()
    pressure = re.search(r'Resource pressure per iteration:\n[^\n]+\n[^\n]+', output)
    # First line after the caption holds the resource indices, the second one the cycles
    index_line, cycles_line = pressure.group().split('\n')[1:]
    for port, cycles in zip(index_line.split(), cycles_line.split()):
        cycles = 0.0 if cycles == '-' else float(cycles)
        name = port_names[port]
        # Some architectures have multiple "ports" per resource in LLVM-MCA
        # e.g., Sandybridge has a Port23 resource which is found at [6.0] and [6.1]
        # we will consider the maximum of both
        port_cycles[name] = max(cycles, port_cycles.get(name, cycles))
    result['port cycles'] = port_cycles

    # Extract throughput including loop-carried-dependency latency
    # Timeline rows start with "[iteration,instruction]". The column of the first 'R' on
    # the row of the last instruction is compared between iteration 0 and iteration 1.
    # (presumably 'R' is the retire marker of llvm-mca's timeline view - confirm)
    timeline_lines = [
        line for line in output.split('\n') if re.match(r'\[[0-9]+,[0-9]+\]', line)]
    lcd = 0
    last_instr_index = None
    lcd_start = None
    for line in timeline_lines:
        if line.startswith('[0,'):
            # Overwritten for every row of iteration 0, thus ends up at its last row
            last_instr_index = re.match(r'\[0,([0-9]+)\]', line).group(1)
            lcd_start = line.index('R')
        elif last_instr_index is not None and \
                line.startswith('[1,' + last_instr_index + ']'):
            lcd = line.index('R') - lcd_start
            break
    result['lcd'] = lcd
    result['throughput'] = max(max(port_cycles.values()), lcd)

    # Extract uops
    uops = 0
    uops_raw = re.search(
        r'\n\[1\](\s+\[[0-9\.]+\]\s+)+Instructions:\n(:?\s*[0-9\.]+\s+[^\n]+\n)+',
        output).group()
    # NOTE(review): after strip() element 0 is the table header, so [2:] also drops the
    # first table row - confirm that skipping this row is intended
    for line in uops_raw.strip().split('\n')[2:]:
        uops += int(line.strip().split(' ')[0])
    result['uops'] = uops

    return result
def _cycles_per_element(row, cycles):
    """Divide *cycles* by the row's pointer_increment/element_size ratio."""
    return cycles / (row['pointer_increment'] / row['element_size'])


def _optional_cycles_per_element(row, raw, key):
    """Return raw[key] scaled like _cycles_per_element, or None if raw has no value for key."""
    cycles = raw.get(key)
    if cycles is None:
        return None
    return _cycles_per_element(row, cycles)


def _store_model_results(row, model, raw, with_latencies=False):
    """Store the raw analysis result of *model* and its per-element values in *row*."""
    row[model + '_raw'] = raw
    row[model + '_ports'] = {
        port: _cycles_per_element(row, cycles)
        for port, cycles in raw['port cycles'].items()}
    row[model + '_prediction'] = _cycles_per_element(row, raw['throughput'])
    row[model + '_throughput'] = max(row[model + '_ports'].values())
    if with_latencies:
        # Not every analysis reports these; the row then keeps None for them
        row[model + '_cp'] = _optional_cycles_per_element(row, raw, 'cp_latency')
        row[model + '_lcd'] = _optional_cycles_per_element(row, raw, 'lcd')


def _save_data(data, data_path):
    """Pickle *data* to *data_path* and return a deep copy of what was saved."""
    print('saving... ', end="", flush=True)
    with data_path.open('wb') as f:
        try:
            pickle.dump(data, f)
            snapshot = deepcopy(data)
            print('saved!')
        except KeyboardInterrupt:
            # Do not leave a half-written file behind: redo the dump, then quit
            f.seek(0)
            pickle.dump(data, f)
            print('saved!')
            sys.exit()
    return snapshot


def build_mark_run_all_kernels(measurements=True, osaca=True, iaca=True, llvm_mca=True):
    """
    Build, mark, analyse and (optionally) measure all kernels.

    If get_current_arch() reports an architecture, only that one is processed ("local"
    mode) and its 'prepare' commands are run first. Otherwise all architectures in
    arch_info are processed, kernels are not built and no measurements are taken.

    Results are collected as one dict ("row") per arch/kernel/compiler/cflags combination
    and pickled to build/<arch>/data.pkl whenever they changed. Values already present in
    a loaded row are reused instead of being recomputed.

    :param measurements: run the scaling measurements (only done in local mode)
    :param osaca: analyse with OSACA
    :param iaca: analyse with IACA (only if configured for the architecture)
    :param llvm_mca: analyse with LLVM-MCA (only if configured for the architecture)
    :return: None
    """
    # NOTE: the `osaca` flag shadows any module-level name `osaca` inside this function
    models = ['IACA', 'OSACA', 'LLVM-MCA', 'Ithemal']
    model_keys = [
        model + '_' + k
        for model in models
        for k in ['ports', 'prediction', 'throughput', 'cp', 'lcd', 'raw']]
    measurement_keys = ['best_length', 'best_runtime', 'L2_traffic', 'allruns', 'perfevents']

    arch = get_current_arch()
    if arch is None:
        arches = arch_info.keys()
        islocal = False
    else:
        islocal = True
        arches = [arch]
        ainfo = arch_info.get(arch)
        if 'prepare' in ainfo:
            for cmd in ainfo['prepare']:
                check_call(cmd)

    for arch in arches:
        ainfo = arch_info.get(arch)
        print(arch)
        data_path = Path(f"build/{arch}/data.pkl")
        if data_path.exists():
            # NOTE: pickle.load can execute arbitrary code; data.pkl must only ever be a
            # file written by this function
            with data_path.open('rb') as f:
                data = pickle.load(f)
        else:
            data = []
        data_lastsaved = deepcopy(data)

        for compiler, compiler_cflags in ainfo['cflags'].items():
            if not shutil.which(compiler) and islocal:
                print(compiler, "not found in path! Skipping...")
                continue
            for cflags_name, cflags in compiler_cflags.items():
                for kernel in get_kernels():
                    print(f"{kernel:<15} {arch:>5} {compiler:>5} {cflags_name:>6}",
                          end=": ", flush=True)
                    # Reuse the first matching row, or start a new one
                    row = next(
                        (r for r in data
                         if r['arch'] == arch and r['kernel'] == kernel and
                         r['compiler'] == compiler and r['cflags_name'] == cflags_name),
                        None)
                    if row is None:
                        row = {
                            'arch': arch,
                            'kernel': kernel,
                            'compiler': compiler,
                            'cflags_name': cflags_name,
                            'element_size': 8,
                        }
                        data.append(row)

                    # Build
                    print("build", end="", flush=True)
                    asm_path, exec_path, overwrite = build_kernel(
                        kernel, arch, compiler, cflags, cflags_name, dontbuild=not islocal)

                    if overwrite:
                        # clear all measurement information
                        for key in measurement_keys:
                            row[key] = None

                    # Mark for IACA, OSACA and LLVM-MCA
                    print("mark", end="", flush=True)
                    try:
                        marked_asmfile, marked_objfile, row['pointer_increment'], overwrite = \
                            mark(asm_path, compiler, cflags, isa=ainfo['isa'],
                                 overwrite=overwrite)
                        row['marking_error'] = None
                    except ValueError as e:
                        row['marking_error'] = str(e)
                        print(":", e)
                        # Persist the error now, the remainder of this iteration is skipped
                        if data != data_lastsaved:
                            data_lastsaved = _save_data(data, data_path)
                        continue

                    # Clear all model generated information if the marked files changed
                    # and make sure every model key exists in the row
                    for key in model_keys:
                        if overwrite or key not in row:
                            row[key] = None

                    # Analyze with IACA, if requested and configured
                    if iaca and ainfo['IACA'] is not None:
                        print("IACA", end="", flush=True)
                        if not row.get('IACA_ports'):
                            _store_model_results(
                                row, 'IACA',
                                iaca_analyse_instrumented_binary(
                                    marked_objfile, micro_architecture=ainfo['IACA']))
                            print(". ", end="", flush=True)
                        else:
                            print("! ", end="", flush=True)

                    # Analyze with OSACA, if requested
                    if osaca:
                        print("OSACA", end="", flush=True)
                        if not row.get('OSACA_ports'):
                            _store_model_results(
                                row, 'OSACA',
                                osaca_analyse_instrumented_assembly(
                                    marked_asmfile, micro_architecture=ainfo['OSACA'],
                                    assign_optimal_throughput=ainfo.get(
                                        'assign_optimal_throughput', True)),
                                with_latencies=True)
                            print(". ", end="", flush=True)
                        else:
                            print("! ", end="", flush=True)

                    # Analyze with LLVM-MCA, if requested and configured
                    if llvm_mca and ainfo['LLVM-MCA'] is not None:
                        print("LLVM-MCA", end="", flush=True)
                        if not row.get('LLVM-MCA_ports'):
                            _store_model_results(
                                row, 'LLVM-MCA',
                                llvm_mca_analyse_instrumented_assembly(
                                    marked_asmfile, micro_architecture=ainfo['LLVM-MCA'],
                                    isa=ainfo['isa']),
                                with_latencies=True)
                            print(". ", end="", flush=True)
                        else:
                            print("! ", end="", flush=True)

                    # Analyze with Ithemal, if not running local and configured
                    if ainfo['Ithemal'] is not None and not islocal:
                        print("Ithemal", end="", flush=True)
                        if not row.get('Ithemal_prediction'):
                            row['Ithemal_prediction'] = get_ithemal_prediction(
                                get_intel_style_code(marked_objfile),
                                model=ainfo['Ithemal'])
                            print(". ", end="", flush=True)
                        else:
                            print("! ", end="", flush=True)

                    if measurements and islocal:
                        # run measurements if on same hardware
                        print("scale", end="", flush=True)
                        if not row.get('allruns'):
                            # find best length with concurrent L2 measurement
                            scaling_runs, best = scalingrun(exec_path)
                            row['best_length'] = best[0]
                            row['best_runtime'] = best[2]
                            row['L2_traffic'] = best[3]
                            row['allruns'] = scaling_runs
                            print(f"({best[0]}). ", end="", flush=True)
                        else:
                            print(f"({row.get('best_length', None)})! ", end="", flush=True)

                    print()

                    # dump to file
                    if data != data_lastsaved:
                        data_lastsaved = _save_data(data, data_path)