示例#1
0
文件: osaca.py 项目: jdomke/OSACA
def insert_byte_marker(args):
    """
    Instrument an assembly file with byte markers using kerncraft.

    The content of ``args.file`` is passed through kerncraft's ``asm_instrumentation`` and the
    result is written to the path ``args.file.name``. If kerncraft cannot be imported, a hint
    is printed to stderr and the process exits with status 1.

    :param args: arguments given from :class:`~argparse.ArgumentParser` after parsing
    """
    try:
        from kerncraft.incore_model import asm_instrumentation
    except ImportError:
        hint = (
            'Module kerncraft not installed. Use \'pip install --user '
            'kerncraft\' for installation.\nFor more information see '
            'https://github.com/RRZE-HPC/kerncraft'
        )
        print(hint, file=sys.stderr)
        sys.exit(1)

    # The instrumentation is fed in-memory streams instead of real files
    source_stream = io.StringIO(args.file.read())
    target_stream = io.StringIO()
    asm_instrumentation(
        input_file=source_stream,
        output_file=target_stream,
        block_selection='manual',
        pointer_increment='auto_with_manual_fallback',
        isa=MachineModel.get_isa_for_arch(args.arch),
    )

    # Rewind so the complete instrumented output is read back
    target_stream.seek(0)
    instrumented = target_stream.read()
    with open(args.file.name, 'w') as out:
        out.write(instrumented)
示例#2
0
文件: osaca.py 项目: jdomke/OSACA
def get_asm_parser(arch) -> BaseParser:
    """
    Pick the assembly parser that matches the ISA of the given architecture.

    :param arch: architecture code
    :type arch: str
    :returns: :class:`~osaca.parser.BaseParser` object for 'x86' or 'aarch64';
              ``None`` for any other ISA
    """
    isa = MachineModel.get_isa_for_arch(arch)
    if isa == 'aarch64':
        return ParserAArch64()
    if isa == 'x86':
        return ParserX86ATT()
    # No parser is available for any other ISA
    return None
示例#3
0
文件: test_cli.py 项目: jdomke/OSACA
 def test_architectures(self):
     """Run the ISA-specific test kernel through OSACA once for every supported architecture."""
     cli_parser = osaca.create_parser()
     for micro_arch in osaca.SUPPORTED_ARCHS:
         # Each architecture is reported as its own sub-test
         with self.subTest(micro_arch=micro_arch):
             kernel_name = 'kernel_{}.s'.format(MachineModel.get_isa_for_arch(micro_arch))
             kernel_path = self._find_test_file(kernel_name)
             cli_args = cli_parser.parse_args(['--arch', micro_arch, kernel_path])
             # Capture the analysis in memory instead of printing it
             sink = StringIO()
             osaca.run(cli_args, output_file=sink)
示例#4
0
    def test_MachineModel_getter(self):
        """Check the getter methods of the CSX and TX2 machine models."""
        memory_operand = {
            'memory': {
                'offset': None,
                'base': {'name': 'r12'},
                'index': {'name': 'rcx'},
                'scale': 8,
            }
        }
        operands = [memory_operand]
        csx = self.machine_model_csx
        tx2 = self.machine_model_tx2

        # No entry is expected for the mnemonic 'GETRESULT' in either model
        self.assertIsNone(csx.get_instruction('GETRESULT', operands))
        self.assertIsNone(tx2.get_instruction('GETRESULT', operands))

        # Architecture and ISA identifiers
        self.assertEqual(csx.get_arch(), 'csx')
        self.assertEqual(tx2.get_arch(), 'tx2')
        self.assertEqual(csx.get_ISA(), 'x86')
        self.assertEqual(tx2.get_ISA(), 'aarch64')

        # Port lists of the CSX model
        expected_ports = ['0', '0DV', '1', '2', '2D', '3', '3D', '4', '5', '6', '7']
        expected_data_ports = ['2D', '3D']
        self.assertEqual(csx.get_ports(), expected_ports)
        self.assertEqual(csx.get_data_ports(), expected_data_ports)

        self.assertFalse(tx2.has_hidden_loads())

        # The architecture lookup is expected to ignore case and reject unknown names
        self.assertEqual(MachineModel.get_isa_for_arch('CSX'), 'x86')
        self.assertEqual(MachineModel.get_isa_for_arch('tX2'), 'aarch64')
        with self.assertRaises(ValueError):
            self.assertIsNone(MachineModel.get_isa_for_arch('THE_MACHINE'))
示例#5
0
    def test_MachineModel_getter(self):
        """Exercise the MachineModel getters on the CSX and TX2 fixtures."""
        base_register = {"name": "r12"}
        index_register = {"name": "rcx"}
        sample_operands = [
            {
                "memory": {
                    "offset": None,
                    "base": base_register,
                    "index": index_register,
                    "scale": 8,
                }
            }
        ]
        unknown_mnemonic = "GETRESULT"

        # Both models are expected to have no entry for the made-up mnemonic
        csx_entry = self.machine_model_csx.get_instruction(unknown_mnemonic, sample_operands)
        self.assertIsNone(csx_entry)
        tx2_entry = self.machine_model_tx2.get_instruction(unknown_mnemonic, sample_operands)
        self.assertIsNone(tx2_entry)

        self.assertEqual(self.machine_model_csx.get_arch(), "csx")
        self.assertEqual(self.machine_model_tx2.get_arch(), "tx2")

        self.assertEqual(self.machine_model_csx.get_ISA(), "x86")
        self.assertEqual(self.machine_model_tx2.get_ISA(), "aarch64")

        # Expected port names of the CSX model
        all_ports = ["0", "0DV", "1", "2", "2D", "3", "3D", "4", "5", "6", "7"]
        data_ports = ["2D", "3D"]
        self.assertEqual(self.machine_model_csx.get_ports(), all_ports)
        self.assertEqual(self.machine_model_csx.get_data_ports(), data_ports)

        self.assertFalse(self.machine_model_tx2.has_hidden_loads())

        # Mixed-case names must resolve; an unknown name must raise ValueError
        self.assertEqual(MachineModel.get_isa_for_arch("CSX"), "x86")
        self.assertEqual(MachineModel.get_isa_for_arch("tX2"), "aarch64")
        with self.assertRaises(ValueError):
            self.assertIsNone(MachineModel.get_isa_for_arch("THE_MACHINE"))
示例#6
0
文件: osaca.py 项目: jdomke/OSACA
def inspect(args, output_file=sys.stdout):
    """
    Does the actual throughput and critical path analysis of OSACA and prints it to the
    terminal.

    If no architecture is given, the ISA is detected heuristically from the code; when parsing
    with the detected ISA fails, the other ISA is tried once. If an architecture was given
    explicitly and parsing fails, the traceback is printed to stderr and the process exits
    with status 1.

    :param args: arguments given from :class:`~argparse.ArgumentParser` after parsing
    :param output_file: Define the stream for output, defaults to :class:`sys.stdout`
    :type output_file: stream, optional
    """
    # Read file
    code = args.file.read()

    # Detect ISA if necessary
    arch = args.arch if args.arch is not None else DEFAULT_ARCHS[BaseParser.detect_ISA(code)]
    print_arch_warning = not args.arch
    isa = MachineModel.get_isa_for_arch(arch)
    verbose = args.verbose
    ignore_unknown = args.ignore_unknown

    # Parse file
    parser = get_asm_parser(arch)
    try:
        parsed_code = parser.parse_file(code)
    except Exception:
        # Only real errors are handled here; KeyboardInterrupt and SystemExit must propagate
        # instead of triggering a silent retry with the other ISA.
        if args.arch is not None:
            traceback.print_exc(file=sys.stderr)
            sys.exit(1)
        # probably the wrong parser based on heuristic: change ISA and try again
        if BaseParser.detect_ISA(code) == 'aarch64':
            arch = DEFAULT_ARCHS['x86']
        else:
            arch = DEFAULT_ARCHS['aarch64']
        isa = MachineModel.get_isa_for_arch(arch)
        parser = get_asm_parser(arch)
        parsed_code = parser.parse_file(code)

    # Reduce to marked kernel or chosen section and add semantics
    if args.lines:
        line_range = get_line_range(args.lines)
        kernel = [line for line in parsed_code if line['line_number'] in line_range]
        print_length_warning = False
    else:
        kernel = reduce_to_section(parsed_code, isa)
        # Print warning if kernel has no markers and is larger than threshold (100)
        print_length_warning = len(kernel) == len(parsed_code) and len(kernel) > 100
    machine_model = MachineModel(arch=arch)
    semantics = ArchSemantics(machine_model)
    semantics.add_semantics(kernel)
    # Do optimal schedule for kernel throughput if wished
    if not args.fixed:
        semantics.assign_optimal_throughput(kernel)

    # Create DiGraphs
    kernel_graph = KernelDG(kernel, parser, machine_model)
    if args.dotpath is not None:
        # A dotpath of '.' is passed on as None
        kernel_graph.export_graph(args.dotpath if args.dotpath != '.' else None)
    # Print analysis
    frontend = Frontend(args.file.name, arch=arch)
    print(
        frontend.full_analysis(
            kernel,
            kernel_graph,
            ignore_unknown=ignore_unknown,
            arch_warning=print_arch_warning,
            length_warning=print_length_warning,
            verbose=verbose
        ),
        file=output_file,
    )
示例#7
0
def extract_model(tree, arch, skip_mem=True):
    """
    Build a machine model for one architecture from an instruction XML tree.

    For every ``instruction`` element the port occupation, latency and uop count recorded for
    ``arch`` are collected from its ``measurement`` and ``IACA`` children and stored in a new
    :class:`MachineModel`. Skipped instructions are reported on stderr.

    :param tree: XML tree that is searched with ``findall('.//instruction')``
                 (presumably the uops.info instruction database -- confirm with caller)
    :param arch: architecture code; matched upper-cased against ``architecture/@name``
    :type arch: str
    :param skip_mem: skip instructions that have an operand of type ``mem``, defaults to `True`
    :type skip_mem: bool, optional
    :returns: the populated :class:`MachineModel`, or `None` if no ISA could be determined
              for ``arch``
    """
    try:
        isa = MachineModel.get_isa_for_arch(arch)
    except Exception:
        print("Skipping...", file=sys.stderr)
        return None
    mm = MachineModel(isa=isa)
    parser = get_parser(isa)

    for instruction_tag in tree.findall('.//instruction'):
        mnemonic = instruction_tag.attrib['asm']
        iform = instruction_tag.attrib['iform']
        # skip any mnemonic which contain spaces (e.g., "REX CRC32")
        if ' ' in mnemonic:
            continue

        # Extract parameter components
        try:
            parameters = extract_paramters(instruction_tag, parser, isa)
            if isa == 'x86':
                parameters.reverse()
        except ValueError as e:
            print(e, file=sys.stderr)
            # Without parameters of its own the instruction cannot be registered; going on
            # would reuse the previous instruction's parameters (or hit an unbound name).
            continue

        # Extract port occupation, throughput and latency
        port_pressure, throughput, latency, uops = [], None, None, None
        arch_tag = instruction_tag.find('architecture[@name="' + arch.upper() + '"]')
        if arch_tag is None:
            continue
        # skip any instructions without port utilization
        if not any('ports' in x.attrib for x in arch_tag.findall('measurement')):
            print("Couldn't find port utilization, skip: ", iform, file=sys.stderr)
            continue
        # skip if computed and measured TP don't match (decided by the first measurement)
        if not [x.attrib['TP_ports'] == x.attrib['TP'] for x in arch_tag.findall('measurement')][
            0
        ]:
            print(
                "Calculated TP from port utilization doesn't match TP, skip: ",
                iform,
                file=sys.stderr,
            )
            continue
        # skip if instruction contains memory operand
        if skip_mem and any(
            x.attrib['type'] == 'mem' for x in instruction_tag.findall('operand')
        ):
            print("Contains memory operand, skip: ", iform, file=sys.stderr)
            continue
        # We collect all measurement and IACA information and compare them later
        for measurement_tag in arch_tag.iter('measurement'):
            # NOTE: this throughput is informational only, it is recomputed from the final
            # port pressure before the instruction is stored
            if 'TP_ports' in measurement_tag.attrib:
                throughput = measurement_tag.attrib['TP_ports']
            else:
                throughput = (
                    measurement_tag.attrib['TP'] if 'TP' in measurement_tag.attrib else None
                )
            uops = (
                int(measurement_tag.attrib['uops']) if 'uops' in measurement_tag.attrib else None
            )
            if 'ports' in measurement_tag.attrib:
                port_pressure.append(port_pressure_from_tag_attributes(measurement_tag.attrib))
            latencies = [
                int(l_tag.attrib['cycles'])
                for l_tag in measurement_tag.iter('latency')
                if 'cycles' in l_tag.attrib
            ]
            if len(latencies) == 0:
                # fall back to the upper bounds if no exact cycle counts are given
                latencies = [
                    int(l_tag.attrib['max_cycles'])
                    for l_tag in measurement_tag.iter('latency')
                    if 'max_cycles' in l_tag.attrib
                ]
            # shifted-slice comparison is True iff not all latencies are equal
            if latencies[1:] != latencies[:-1]:
                print(
                    "Contradicting latencies found, using smallest:",
                    iform,
                    latencies,
                    file=sys.stderr,
                )
            if latencies:
                latency = min(latencies)

        # Ordered by IACA version (newest last)
        for iaca_tag in sorted(
            arch_tag.iter('IACA'), key=lambda i: StrictVersion(i.attrib['version'])
        ):
            if 'ports' in iaca_tag.attrib:
                port_pressure.append(port_pressure_from_tag_attributes(iaca_tag.attrib))

        # Check if all are equal
        if port_pressure:
            if port_pressure[1:] != port_pressure[:-1]:
                print("Contradicting port occupancies, using latest IACA:", iform, file=sys.stderr)
            port_pressure = port_pressure[-1]
        else:
            # print("No data available for this architecture:", mnemonic, file=sys.stderr)
            continue

        # Adding Intel's 2D and 3D pipelines on Intel µarchs, without Ice Lake:
        if arch.upper() in intel_archs and arch.upper() not in ['ICL']:
            if any(p['class'] == 'memory' for p in parameters):
                # We have a memory parameter, if ports 2 & 3 are present, also add 2D & 3D
                # TODO remove port7 on 'hsw' onward and split entries depending on addressing mode
                port_23 = False
                port_4 = False
                for pp in port_pressure:
                    if '2' in pp[1] and '3' in pp[1]:
                        port_23 = True
                    if '4' in pp[1]:
                        port_4 = True
                # Add (X, ['2D', '3D']) if load ports (2 & 3) are used, but not the store port (4)
                # X = 2 on SNB and IVB IFF used in combination with ymm register, otherwise X = 1
                if arch.upper() in ['SNB', 'IVB'] and any(
                    p['class'] == 'register' and p['name'] == 'ymm' for p in parameters
                ):
                    data_port_throughput = 2
                else:
                    data_port_throughput = 1
                if port_23 and not port_4:
                    port_pressure.append((data_port_throughput, ['2D', '3D']))

        # Add missing ports:
        for ports in [pp[1] for pp in port_pressure]:
            for p in ports:
                mm.add_port(p)

        throughput = max(mm.average_port_pressure(port_pressure))

        mm.set_instruction(mnemonic, parameters, latency, port_pressure, throughput, uops)
    # TODO eliminate entries which could be covered by automatic load / store expansion
    return mm
示例#8
0
def extract_model(tree, arch, skip_mem=True):
    """
    Build a machine model for one architecture from an instruction XML tree.

    For every ``instruction`` element the port occupation, latency and uop count recorded for
    ``arch`` are collected from its ``measurement`` and ``IACA`` children and stored in a new
    :class:`MachineModel`. Skipped instructions and warnings are reported on stderr.

    :param tree: XML tree that is searched with ``findall(".//instruction")``
                 (presumably the uops.info instruction database -- confirm with caller)
    :param arch: architecture code; matched upper-cased against ``architecture/@name``
    :type arch: str
    :param skip_mem: skip instructions that have an operand of type ``mem``, defaults to `True`
    :type skip_mem: bool, optional
    :returns: the populated :class:`MachineModel`, or `None` if no ISA could be determined
              for ``arch``
    """
    try:
        isa = MachineModel.get_isa_for_arch(arch)
    except Exception:
        print("Skipping...", file=sys.stderr)
        return None
    mm = MachineModel(isa=isa)
    parser = get_parser(isa)

    for instruction_tag in tree.findall(".//instruction"):
        mnemonic = instruction_tag.attrib["asm"]
        iform = instruction_tag.attrib["iform"]
        # reduce to second part if mnemonic contain space (e.g., "REX CRC32")
        if " " in mnemonic:
            mnemonic = mnemonic.split(" ", 1)[1]

        # Extract parameter components
        try:
            parameters = extract_paramters(instruction_tag, parser, isa)
            if isa == "x86":
                parameters.reverse()
        except ValueError as e:
            print(e, file=sys.stderr)
            # Without parameters of its own the instruction cannot be registered; going on
            # would reuse the previous instruction's parameters (or hit an unbound name).
            continue

        # Extract port occupation, throughput and latency
        port_pressure, throughput, latency, uops = [], None, None, None
        arch_tag = instruction_tag.find('architecture[@name="' + arch.upper() + '"]')
        if arch_tag is None:
            continue
        # skip any instructions without port utilization
        if not any("ports" in x.attrib for x in arch_tag.findall("measurement")):
            print("Couldn't find port utilization, skip: ", iform, file=sys.stderr)
            continue
        # warn (but do not skip) if the TP computed from port usage exceeds the measured TP;
        # decided by the first measurement
        if [float(x.attrib["TP_ports"]) > min(float(x.attrib["TP_loop"]),
                                              float(x.attrib["TP_unrolled"]))
                for x in arch_tag.findall("measurement")][0]:
            print(
                "Calculated TP is greater than measured TP.",
                iform,
                file=sys.stderr,
            )
        # skip if instruction contains memory operand
        if skip_mem and any(
            x.attrib["type"] == "mem" for x in instruction_tag.findall("operand")
        ):
            print("Contains memory operand, skip: ", iform, file=sys.stderr)
            continue
        # We collect all measurement and IACA information and compare them later
        for measurement_tag in arch_tag.iter("measurement"):
            # NOTE: this throughput is informational only, it is recomputed from the final
            # port pressure before the instruction is stored
            if "TP_ports" in measurement_tag.attrib:
                throughput = float(measurement_tag.attrib["TP_ports"])
            else:
                # XML attribute values are strings: convert before comparing, and use the
                # same "TP_unrolled" attribute name as the check above
                measured_tps = [
                    float(measurement_tag.attrib[key])
                    for key in ("TP_loop", "TP_unrolled", "TP")
                    if key in measurement_tag.attrib
                ]
                throughput = min(measured_tps) if measured_tps else None
            uops = int(measurement_tag.attrib["uops"]) if "uops" in measurement_tag.attrib else None
            if "ports" in measurement_tag.attrib:
                port_pressure.append(port_pressure_from_tag_attributes(measurement_tag.attrib))
            latencies = [
                int(l_tag.attrib["cycles"])
                for l_tag in measurement_tag.iter("latency")
                if "cycles" in l_tag.attrib
            ]
            if len(latencies) == 0:
                # fall back to the upper bounds if no exact cycle counts are given
                latencies = [
                    int(l_tag.attrib["max_cycles"])
                    for l_tag in measurement_tag.iter("latency")
                    if "max_cycles" in l_tag.attrib
                ]
            # shifted-slice comparison is True iff not all latencies are equal
            if latencies[1:] != latencies[:-1]:
                print(
                    "Contradicting latencies found, using smallest:",
                    iform,
                    latencies,
                    file=sys.stderr,
                )
            if latencies:
                latency = min(latencies)

        # Ordered by IACA version (newest last)
        for iaca_tag in sorted(
            arch_tag.iter("IACA"), key=lambda i: StrictVersion(i.attrib["version"])
        ):
            if "ports" in iaca_tag.attrib:
                port_pressure.append(port_pressure_from_tag_attributes(iaca_tag.attrib))

        # Check if all are equal
        if port_pressure:
            if port_pressure[1:] != port_pressure[:-1]:
                print("Contradicting port occupancies, using latest IACA:", iform, file=sys.stderr)
            port_pressure = port_pressure[-1]
        else:
            # print("No data available for this architecture:", mnemonic, file=sys.stderr)
            continue

        # Adding Intel's 2D and 3D pipelines on Intel µarchs, without Ice Lake:
        if arch.upper() in intel_archs and arch.upper() not in ["ICL"]:
            if any(p["class"] == "memory" for p in parameters):
                # We have a memory parameter, if ports 2 & 3 are present, also add 2D & 3D
                # TODO remove port7 on 'hsw' onward and split entries depending on addressing mode
                port_23 = False
                port_4 = False
                for pp in port_pressure:
                    if "2" in pp[1] and "3" in pp[1]:
                        port_23 = True
                    if "4" in pp[1]:
                        port_4 = True
                # Add (x, ['2D', '3D']) if load ports (2 & 3) are used, but not the store port (4)
                if port_23 and not port_4:
                    if (
                        arch.upper() in ["SNB", "IVB"]
                        and any(p.get('name', '') == 'ymm' for p in parameters)
                        and '128' not in mnemonic
                    ):
                        # x = 2 if SNB or IVB and ymm register in any operand and not '128' in
                        # instruction name
                        port2D3D_pressure = 2
                    else:
                        # otherwise x = 1
                        port2D3D_pressure = 1
                    port_pressure.append((port2D3D_pressure, ["2D", "3D"]))

        # Add missing ports:
        for ports in [pp[1] for pp in port_pressure]:
            for p in ports:
                mm.add_port(p)

        throughput = max(mm.average_port_pressure(port_pressure))
        mm.set_instruction(mnemonic, parameters, latency, port_pressure, throughput, uops)
    # TODO eliminate entries which could be covered by automatic load / store expansion
    return mm