Пример #1
0
def main():
    """Auto-schedule a small 1-D pipeline with the Li2018 plugin and run it.

    The pipeline is the chain ``in -> f_0 -> f_1 -> f_2``; the generated
    schedule source is printed before the pipeline is JIT-compiled and
    realized over 1000 elements.
    """
    hl.load_plugin("autoschedule_li2018")

    x = hl.Var('x')

    # Source stage: the coordinate itself, cast to a 32-bit float.
    source = hl.Func('in')
    source[x] = hl.f32(x)

    # f_0 doubles the source.
    doubled = hl.Func('f_0')
    doubled[x] = 2 * source[x]

    # f_1 takes the sine of f_0.
    sine = hl.Func('f_1')
    sine[x] = hl.sin(doubled[x])

    # f_2 squares f_1.
    squared = hl.Func('f_2')
    squared[x] = sine[x] * sine[x]

    # The output needs a size estimate before it can be auto-scheduled.
    squared.set_estimate(x, 0, 1000)
    pipeline = hl.Pipeline(squared)

    # Only the first MachineParams argument is used (number of CPU cores).
    schedule = pipeline.auto_schedule('Li2018', hl.Target(),
                                      hl.MachineParams(32, 0, 0))
    print('Schedule:')
    print(schedule.schedule_source)

    pipeline.compile_jit()  # compile
    output = pipeline.realize(1000)  # compute and get the buffer
Пример #2
0
def gen_twoel(zone_name, **kwargs):
    """Generate ahead-of-time build outputs for the selected twoel zones.

    Args:
        zone_name: Comma-separated zone names to keep, or 'all' to keep
            every zone. When nothing matches, 'list' prints the available
            zone names; any other value prints a "not found" message.
        **kwargs: Forwarded to ``twoel_gen.Generate_twoel``, except for the
            optional ``target_name`` entry, which selects the Halide target
            string to compile for.

    Raises:
        SystemExit: With status 1 when no zone matches ``zone_name``.

    Side effects:
        Replaces ``zones.loops`` with the selected zones and writes
        twoel.h, twoel.cpp, twoel.a, twoel.stmt and twoel.html.
    """
    # NOTE(review): `zones` is never assigned in this function, so it has to
    # come from the enclosing module -- confirm it is defined there.
    zone_names = zone_name.split(",")
    myzones = [
        zone for zone in zones.loops
        if zone_name == 'all' or zone.name in zone_names
    ]
    if not myzones:
        if zone_name == 'list':
            # Attribute access, matching the filter above.
            print([z.name for z in zones.loops])
        else:
            print("no zone %s found" % zone_name)
        raise SystemExit(1)

    # target_name is consumed here so it is not forwarded to the generator.
    target_name = kwargs.pop(
        "target_name",
        "x86-64-linux-avx-avx2-f16c-fma-sse41-profile-disable_llvm_loop_opt")

    zones.loops = myzones
    gen = twoel_gen.Generate_twoel(loopnests=zones, **kwargs)
    gen.generate_twoel()
    p = gen.pipeline
    print("generating for target", target_name)
    target = hl.Target(target_name)
    p.compile_to(
        {
            hl.Output.c_header: "twoel.h",
            hl.Output.c_source: "twoel.cpp",
            hl.Output.static_library: "twoel.a",
            hl.Output.stmt: "twoel.stmt",
            hl.Output.stmt_html: "twoel.html",
            # the following outputs are useful for running it from python
            #hl.Output.object: "twoel.o",
            #hl.Output.python_extension: "twoel.py.cpp",
        },
        list(gen.inputs.values()),
        "twoel",
        target)
Пример #3
0
def call_twoel(zone_name,
               seed=2,
               datasize=15,
               itercount=10,
               target_name="host-disable_llvm_loop_opt",
               **kwargs):
    """JIT-compile the selected twoel zones, run them, and time the runs.

    Args:
        zone_name: Comma-separated zone names to keep, or 'all' to keep
            every zone. When nothing matches, 'list' prints the available
            zone names; any other value prints a "not found" message.
        seed: Seed passed to ``np.random.seed`` before the random input
            buffers are generated.
        datasize: N, the default extent of every buffer dimension and of
            both realized output dimensions.
        itercount: Number of timed runs. With 0 only the dry run is done.
        target_name: Halide target string used for JIT compilation.
        **kwargs: Forwarded to ``twoel_gen.Generate_twoel``.

    Returns:
        ``(walltime, cputime)``: average seconds per timed run, or
        ``(0.0, 0.0)`` when ``itercount`` is 0.

    Raises:
        KeyError: If a generator input has no matching generated buffer.
        SystemExit: With status 1 when no zone matches ``zone_name``.
    """
    N = datasize

    # Input descriptions: "d" is the number of dimensions (0 = scalar),
    # "value" is a constant fill value, "shape" overrides the default [N]*d.
    # Entries with neither "value" nor "shape" get random data.
    # The order matters: it fixes how the random stream is consumed.
    input_specs = [
        {"name": "delo2", "d": 0, "value": 0.001},
        {"name": "delta", "d": 0, "value": 0.001},
        {"name": "rdelta", "d": 0, "value": 0.001},
        {"name": "expnt", "d": 1, "value": 0.00001},
        {"name": "rnorm", "d": 1},
        {"name": "x", "d": 1},
        {"name": "y", "d": 1},
        {"name": "z", "d": 1},
        {"name": "fm", "d": 2, "shape": [1002, 5]},
        {"name": "g_fock", "d": 2},
        {"name": "g_dens", "d": 2},
        {"name": "g_trace", "d": 4, "value": 0.0},
    ]
    inputs = {spec["name"]: spec for spec in input_specs}

    # generate input data
    print("input/output size is", N, "^2")
    buffers_by_name = {}
    np.random.seed(seed)
    for key, param in inputs.items():
        if param['d'] == 0:
            # NOTE(review): scalars always get 0.2 and ignore the entry's
            # "value" field -- confirm this is intended.
            thing = 0.2
        else:
            shape = param.get('shape', [N] * param['d'])
            thing = hl.Buffer(hl.Float(64), shape, name=key)
            if 'value' in param:
                # A 0.0 fill is skipped; presumably a fresh hl.Buffer is
                # already zeroed -- TODO confirm.
                if param['value'] != 0.0:
                    for pos in np.ndindex(*shape):
                        thing[pos] = param['value']
            else:
                # Uniform random data in [-0.5, 0.5).
                values = np.random.rand(*shape) - 0.5
                for pos in np.ndindex(*shape):
                    thing[pos] = values[pos]
        buffers_by_name[key] = thing

    # get JIT pipeline
    zones = twoel_gen.define_original_twoel_zone().split_recursive()
    zone_names = zone_name.split(",")
    myzones = [
        zone for zone in zones.loops
        if zone_name == 'all' or zone.name in zone_names
    ]
    if not myzones:
        if zone_name == 'list':
            print([z.name for z in zones.loops])
        else:
            print("no zone %s found" % zone_name)
        raise SystemExit(1)
    zones.loops = myzones
    gen = twoel_gen.Generate_twoel(loopnests=zones, **kwargs)
    gen.generate_twoel()
    p = gen.pipeline
    target = hl.Target(target_name)
    zone_names = [z.name for z in myzones]
    print("compiling zones", zone_names, "for target", target)
    p.compile_jit(target)

    # plug in the parameter values
    for param in gen.inputs.values():
        name = param.name()
        if name in buffers_by_name:
            thing = buffers_by_name[name]
        elif name.endswith("_in") and name[:-3] in buffers_by_name:
            # An input named "<name>_in" is fed from the "<name>" buffer.
            thing = buffers_by_name[name[:-3]]
        else:
            raise KeyError(name)
        param.set(thing)

    # dry-run
    p.realize(N, N)
    print(itercount, "timed runs")

    if itercount == 0:
        # when generating trace output, just doing the dry-run is enough.
        return 0.0, 0.0

    # benchmark it
    walltime = 0.0
    cputime = 0.0
    for _ in range(itercount):
        cpu_before = time.process_time()
        # perf_counter is monotonic, so clock adjustments cannot skew it.
        wall_before = time.perf_counter()

        p.realize(N, N)

        cpu_after = time.process_time()
        wall_after = time.perf_counter()

        walltime += wall_after - wall_before
        cputime += cpu_after - cpu_before
    print("walltime: %.3f" % walltime)
    print("cputime: %.3f" % cputime)
    walltime /= itercount
    cputime /= itercount
    print("walltime per iter: %.3e" % walltime)
    print("cputime per iter: %.3e" % cputime)
    throughput = N * N * N * N / walltime
    print("throughput: %.3e g() calls per second (roughly)" % throughput)
    return walltime, cputime
Пример #4
0
def test_target():
    """Check hl.Target construction, string forms, features and queries."""
    os_ = hl.TargetOS
    arch = hl.TargetArch
    feat = hl.TargetFeature
    is_valid = hl.Target.validate_target_string

    def linux_x86_32_sse41():
        # The target most of the feature checks below start from.
        return hl.Target(os_.Linux, arch.X86, 32, [feat.SSE41])

    # Target("") should be exactly like get_host_target().
    host = hl.get_host_target()
    from_empty = hl.Target("")
    assert host == from_empty, "Default ctor failure"
    assert host.supported()

    # to_string roundtripping
    blank = hl.Target()
    blank_str = blank.to_string()
    assert blank_str == "arch_unknown-0-os_unknown"

    # Note, this should *not* validate, since validate_target_string
    # now returns false if any of arch-bits-os are undefined
    assert not is_valid(blank_str)

    # Don't attempt to roundtrip this: trying to create
    # a Target with unknown portions will now assert-fail.

    # repr() and str()
    assert str(blank) == "arch_unknown-0-os_unknown"
    assert repr(blank) == "<halide.Target arch_unknown-0-os_unknown>"

    assert blank.os == os_.OSUnknown
    assert blank.arch == arch.ArchUnknown
    assert blank.bits == 0

    # Full specification round-trip:
    text = linux_x86_32_sse41().to_string()
    assert text == "x86-32-linux-sse41"
    assert is_valid(text)

    # Full specification (without features) round-trip:
    text = hl.Target(os_.Linux, arch.X86, 32).to_string()
    assert text == "x86-32-linux"
    assert is_valid(text)

    # Full specification round-trip, crazy features
    crazy = hl.Target(os_.Android, arch.ARM, 32, [
        feat.JIT, feat.SSE41, feat.AVX,
        feat.AVX2, feat.CUDA, feat.OpenCL,
        feat.OpenGL, feat.OpenGLCompute,
        feat.Debug,
    ])
    text = crazy.to_string()
    assert text == "arm-32-android-avx-avx2-cuda-debug-jit-opencl-opengl-openglcompute-sse41"
    assert is_valid(text)

    # Expected failures:
    rejected = (
        "host-unknowntoken",
        "x86-23",
        # bits == 0 is allowed only if arch_unknown and os_unknown are
        # specified, and no features are set
        "x86-0",
        "0-arch_unknown-os_unknown-sse41",
        # "host" is only supported as the first token
        "opencl-host",
    )
    for bad in rejected:
        assert not is_valid(bad)

    # set_feature
    tgt = linux_x86_32_sse41()
    assert tgt.has_feature(feat.SSE41)
    assert not tgt.has_feature(feat.AVX)
    tgt.set_feature(feat.AVX)
    tgt.set_feature(feat.SSE41, False)
    assert tgt.has_feature(feat.AVX)
    assert not tgt.has_feature(feat.SSE41)

    # set_features
    tgt = linux_x86_32_sse41()
    assert tgt.has_feature(feat.SSE41)
    assert not tgt.has_feature(feat.AVX)
    tgt.set_features([feat.SSE41], False)
    tgt.set_features([feat.AVX, feat.AVX2], True)
    assert tgt.has_feature(feat.AVX)
    assert tgt.has_feature(feat.AVX2)
    assert not tgt.has_feature(feat.SSE41)

    # with_feature
    tgt = linux_x86_32_sse41()
    extended = tgt.with_feature(feat.NoAsserts).with_feature(
        feat.NoBoundsQuery)
    text = extended.to_string()
    assert text == "x86-32-linux-no_asserts-no_bounds_query-sse41"

    # without_feature
    tgt = hl.Target(os_.Linux, arch.X86, 32, [feat.SSE41, feat.NoAsserts])
    # Note that NoBoundsQuery wasn't set here, so 'without' is a no-op
    reduced = tgt.without_feature(feat.NoAsserts).without_feature(
        feat.NoBoundsQuery)
    text = reduced.to_string()
    assert text == "x86-32-linux-sse41"

    # natural_vector_size
    # SSE4.1 is 16 bytes wide
    tgt = linux_x86_32_sse41()
    expected_lanes = (
        (hl.UInt(8), 16),
        (hl.Int(16), 8),
        (hl.UInt(32), 4),
        (hl.Float(32), 4),
    )
    for elem_type, lanes in expected_lanes:
        assert tgt.natural_vector_size(elem_type) == lanes

    # has_gpu_feature
    with_gpu = hl.Target(os_.Linux, arch.X86, 32, [feat.OpenCL])
    without_gpu = hl.Target(os_.Linux, arch.X86, 32, [])
    assert with_gpu.has_gpu_feature()
    assert not without_gpu.has_gpu_feature()

    # has_large_buffers & maximum_buffer_size
    large = hl.Target(os_.Linux, arch.X86, 64, [feat.LargeBuffers])
    small = hl.Target(os_.Linux, arch.X86, 64, [])
    assert large.has_large_buffers()
    assert large.maximum_buffer_size() == 9223372036854775807
    assert not small.has_large_buffers()
    assert small.maximum_buffer_size() == 2147483647

    # supports_device_api
    with_cuda = hl.Target(os_.Linux, arch.X86, 64, [feat.CUDA])
    without_cuda = hl.Target(os_.Linux, arch.X86, 64)
    assert with_cuda.supports_device_api(hl.DeviceAPI.CUDA)
    assert not without_cuda.supports_device_api(hl.DeviceAPI.CUDA)

    # supports_type (deprecated version)
    with_metal = hl.Target(os_.OSX, arch.X86, 64, [feat.Metal])
    without_metal = hl.Target(os_.OSX, arch.X86, 64)
    assert not with_metal.supports_type(hl.Float(64))
    assert without_metal.supports_type(hl.Float(64))

    # supports_type (preferred version)
    with_metal = hl.Target(os_.OSX, arch.X86, 64, [feat.Metal])
    without_metal = hl.Target(os_.OSX, arch.X86, 64)
    assert not with_metal.supports_type(hl.Float(64), hl.DeviceAPI.Metal)
    assert not without_metal.supports_type(hl.Float(64), hl.DeviceAPI.Metal)

    # target_feature_for_device_api
    assert hl.target_feature_for_device_api(
        hl.DeviceAPI.OpenCL) == feat.OpenCL

    # with_feature with non-convertible lists
    try:
        hl.Target(os_.Linux, arch.X86, 32, ["this is a string"])
    except TypeError as e:
        assert "incompatible constructor arguments" in str(e)
    else:
        assert False, 'Did not see expected exception!'
Пример #5
0
def _read_header(path, size):
    """Return the first ``size`` bytes of ``path``, or None if unreadable."""
    try:
        with open(path, "rb") as f:
            return f.read(size)
    except OSError:
        return None


def main():
    """Cross-compile a one-stage pipeline and check the object file headers.

    Compiles the "brighter" pipeline for the host, then for 32-bit ARM
    Android, 64-bit x86 Windows and 32-bit ARM iOS, and compares the leading
    bytes of each generated object file with that format's magic values.

    Returns:
        0 on success, -1 if a generated object file cannot be opened or read.

    Raises:
        Exception: If an object file's header does not match the expected
            magic values.
    """

    # We'll define the simple one-stage pipeline that we used in lesson 10.
    brighter = hl.Func("brighter")
    x, y = hl.Var("x"), hl.Var("y")

    # Declare the arguments.
    offset = hl.Param(hl.UInt(8))
    input_image = hl.ImageParam(hl.UInt(8), 2)
    args = [input_image, offset]

    # Define the hl.Func.
    brighter[x, y] = input_image[x, y] + offset

    # Schedule it.
    brighter.vectorize(x, 16).parallel(y)

    # The following line is what we did in lesson 10. It compiles an
    # object file suitable for the system that you're running this
    # program on.  For example, if you compile and run this file on
    # 64-bit linux on an x86 cpu with sse4.1, then the generated code
    # will be suitable for 64-bit linux on x86 with sse4.1.
    brighter.compile_to_file("lesson_11_host", args, "lesson_11_host")

    # We can also compile object files suitable for other cpus and
    # operating systems. You do this with an optional last argument
    # to compile_to_file which specifies the target to compile for.

    create_android = True
    create_windows = True
    create_ios = True

    if create_android:
        # Let's use this to compile a 32-bit arm android version of this code:
        target = hl.Target()
        target.os = hl.TargetOS.Android  # The operating system
        target.arch = hl.TargetArch.ARM  # The CPU architecture
        target.bits = 32  # The bit-width of the architecture
        arm_features = []  # A list of features to set
        target.set_features(arm_features)
        # Pass the target as the last argument.
        brighter.compile_to_file("lesson_11_arm_32_android", args,
                                 "lesson_11_arm_32_android", target)

    if create_windows:
        # And now a Windows object file for 64-bit x86 with AVX and SSE 4.1:
        target = hl.Target()
        target.os = hl.TargetOS.Windows
        target.arch = hl.TargetArch.X86
        target.bits = 64
        target.set_features([hl.TargetFeature.AVX, hl.TargetFeature.SSE41])
        brighter.compile_to_file("lesson_11_x86_64_windows", args,
                                 "lesson_11_x86_64_windows", target)

    if create_ios:
        # And finally an iOS mach-o object file for one of Apple's 32-bit
        # ARM processors - the A6. It's used in the iPhone 5. The A6 uses
        # a slightly modified ARM architecture called ARMv7s. We specify
        # this using the target features field.  Support for Apple's
        # 64-bit ARM processors is very new in llvm, and still somewhat
        # flaky.
        target = hl.Target()
        target.os = hl.TargetOS.IOS
        target.arch = hl.TargetArch.ARM
        target.bits = 32
        target.set_features([hl.TargetFeature.ARMv7s])
        brighter.compile_to_file("lesson_11_arm_32_ios", args,
                                 "lesson_11_arm_32_ios", target)

    # Now let's check these files are what they claim, by examining
    # their first few bytes.

    if create_android:
        # 32-arm android object files start with the magic bytes:
        # uint8_t []
        arm_32_android_magic = [
            0x7f,
            ord('E'),
            ord('L'),
            ord('F'),  # ELF format
            1,  # 32-bit
            1,  # 2's complement little-endian
            1,  # Current version of elf
        ]

        length = len(arm_32_android_magic)
        header_bytes = _read_header("lesson_11_arm_32_android.o", length)
        if header_bytes is None:
            print("Android object file not generated")
            return -1

        header = list(unpack("B" * length, header_bytes))
        if header != arm_32_android_magic:
            # Show which byte positions matched before failing.
            print([a == b for a, b in zip(header, arm_32_android_magic)])
            raise Exception(
                "Unexpected header bytes in 32-bit arm object file.")

    if create_windows:
        # 64-bit windows object files start with the magic 16-bit value 0x8664
        # (presumably referring to x86-64)
        # uint8_t  []
        win_64_magic = [0x64, 0x86]

        header_bytes = _read_header("lesson_11_x86_64_windows.obj", 2)
        if header_bytes is None:
            print("Windows object file not generated")
            return -1

        header = list(unpack("B" * 2, header_bytes))
        if header != win_64_magic:
            raise Exception(
                "Unexpected header bytes in 64-bit windows object file.")

    if create_ios:
        # 32-bit arm iOS mach-o files start with the following magic values:
        #  uint32_t []
        arm_32_ios_magic = [
            0xfeedface,  # Mach-o magic bytes
            12,  # CPU type is ARM
            11,  # CPU subtype is ARMv7s
            1,  # It's a relocatable object file.
        ]
        header_bytes = _read_header("lesson_11_arm_32_ios.o", 4 * 4)
        if header_bytes is None:
            print("ios object file not generated")
            return -1

        # The header words are little-endian; "<" decodes them the same way
        # on any host instead of relying on native byte order.
        header = list(unpack("<4I", header_bytes))
        if header != arm_32_ios_magic:
            raise Exception(
                "Unexpected header bytes in 32-bit arm ios object file.")

    # It looks like the object files we produced are plausible for
    # those targets. We'll count that as a success for the purposes
    # of this tutorial. For a real application you'd then need to
    # figure out how to integrate Halide into your cross-compilation
    # toolchain. There are several small examples of this in the
    # Halide repository under the apps folder. See HelloAndroid and
    # HelloiOS here:
    # https://github.com/halide/Halide/tree/master/apps/
    print("Success!")
    return 0