def main():
    """Auto-schedule a small four-stage 1-D pipeline with Li2018 and run it.

    Prints the schedule source produced by the autoscheduler, JIT-compiles
    the pipeline and realizes 1000 elements.
    """
    hl.load_plugin("autoschedule_li2018")

    x = hl.Var('x')

    # Pointwise chain: in -> f_0 -> f_1 -> f_2.
    source = hl.Func('in')
    source[x] = hl.f32(x)  # Cast to float 32

    doubled = hl.Func('f_0')
    doubled[x] = 2 * source[x]

    sine = hl.Func('f_1')
    sine[x] = hl.sin(doubled[x])

    squared = hl.Func('f_2')
    squared[x] = sine[x] * sine[x]

    # Setup: the estimate covers the same range that is realized below.
    squared.set_estimate(x, 0, 1000)
    pipeline = hl.Pipeline(squared)
    default_target = hl.Target()
    # Only first parameter is used (number of cores on CPU)
    machine = hl.MachineParams(32, 0, 0)

    schedule = pipeline.auto_schedule('Li2018', default_target, machine)
    print('Schedule:')
    print(schedule.schedule_source)

    pipeline.compile_jit()  # compile
    pipeline.realize(1000)  # compute; the resulting buffer is not used further
def gen_twoel(zone_name, **kwargs):
    """Generate the twoel pipeline for the selected zones and compile it to files.

    Writes ``twoel.h``, ``twoel.cpp``, ``twoel.a``, ``twoel.stmt`` and
    ``twoel.html`` via ``Pipeline.compile_to``.

    Args:
        zone_name: Comma-separated names of the zones to keep, or ``'all'`` to
            keep every zone. If nothing matches and the value is ``'list'``,
            the available zone names are printed before exiting.
        **kwargs: ``target_name`` (optional) is consumed here and used as the
            Halide target string; every other keyword is forwarded to
            ``twoel_gen.Generate_twoel``.

    Raises:
        SystemExit: With status 1 when no zone matches ``zone_name``.
    """
    # NOTE(review): `zones` is not assigned anywhere in this function, so it
    # has to be a module-level object -- confirm it is defined before this
    # function is called. Its `loops` attribute is overwritten below.
    wanted = zone_name.split(",")
    myzones = [
        zone for zone in zones.loops
        if zone_name == 'all' or zone.name in wanted
    ]
    if not myzones:
        if zone_name == 'list':
            # Read the name the same way the filter above does (`.name`);
            # the previous `z['name']` disagreed with it.
            print([z.name for z in zones.loops])
        else:
            print("no zone %s found" % zone_name)
        # Same effect as exit(1) without depending on the `site` helper.
        raise SystemExit(1)

    # `target_name` must not reach Generate_twoel, so take it out of kwargs.
    target_name = kwargs.pop(
        "target_name",
        "x86-64-linux-avx-avx2-f16c-fma-sse41-profile-disable_llvm_loop_opt")

    zones.loops = myzones
    gen = twoel_gen.Generate_twoel(loopnests=zones, **kwargs)
    gen.generate_twoel()
    p = gen.pipeline

    print("generating for target", target_name)
    target = hl.Target(target_name)
    p.compile_to(
        {
            hl.Output.c_header: "twoel.h",
            hl.Output.c_source: "twoel.cpp",
            hl.Output.static_library: "twoel.a",
            hl.Output.stmt: "twoel.stmt",
            hl.Output.stmt_html: "twoel.html",
            # the following outputs are useful for running it from python
            # hl.Output.object: "twoel.o",
            # hl.Output.python_extension: "twoel.py.cpp",
        },
        list(gen.inputs.values()),
        "twoel",
        target)
def call_twoel(zone_name, seed=2, datasize=15, itercount=10, target_name="host-disable_llvm_loop_opt", **kwargs):
    """JIT-compile the twoel pipeline for the selected zones and benchmark it.

    Args:
        zone_name: Comma-separated names of the zones to keep, or ``'all'`` to
            keep every zone. If nothing matches and the value is ``'list'``,
            the available zone names are printed before exiting.
        seed: Seed passed to ``np.random.seed`` before the random inputs are
            generated.
        datasize: Extent ``N`` used for every dimension of inputs that have no
            explicit shape, and for the ``realize(N, N)`` calls.
        itercount: Number of timed runs. With a value of 0 (or less) only the
            dry run is performed.
        target_name: Halide target string used for JIT compilation.
        **kwargs: Forwarded to ``twoel_gen.Generate_twoel``.

    Returns:
        ``(walltime, cputime)``: average seconds per timed run, or
        ``(0.0, 0.0)`` when no timed runs were requested.

    Raises:
        SystemExit: With status 1 when no zone matches ``zone_name``.
        KeyError: When a pipeline input has no generated value to bind.
    """
    N = datasize

    # "d" is the number of dimensions (0 means scalar), "shape" overrides the
    # default [N] * d extents, and "value" requests a constant fill; entries
    # without "value" are filled with random numbers.
    input_specs = [
        {"name": "delo2", "d": 0, "value": 0.001},
        {"name": "delta", "d": 0, "value": 0.001},
        {"name": "rdelta", "d": 0, "value": 0.001},
        {"name": "expnt", "d": 1, "value": 0.00001},
        {"name": "rnorm", "d": 1},
        {"name": "x", "d": 1},
        {"name": "y", "d": 1},
        {"name": "z", "d": 1},
        {"name": "fm", "d": 2, "shape": [1002, 5]},
        {"name": "g_fock", "d": 2},
        {"name": "g_dens", "d": 2},
        {"name": "g_trace", "d": 4, "value": 0.0},
    ]

    # generate input data
    print("input/output size is", N, "^2")
    buffers_by_name = {}
    # Honour the caller's seed (it used to be overwritten with a constant 2).
    np.random.seed(seed)
    for spec in input_specs:
        key = spec["name"]
        if spec["d"] == 0:
            # NOTE(review): scalar inputs always get 0.2; their "value" entry
            # is not consulted -- confirm this is intended.
            thing = 0.2
        else:
            shape = spec.get("shape", [N] * spec["d"])
            thing = hl.Buffer(hl.Float(64), shape, name=key)
            if "value" in spec:
                # A "value" of exactly 0.0 skips the fill and leaves the
                # freshly created buffer untouched.
                if spec["value"] != 0.0:
                    for pos in np.ndindex(*shape):
                        thing[pos] = spec["value"]
            else:
                # Uniform random values in [-0.5, 0.5).
                values = np.random.rand(*shape) - 0.5
                for pos in np.ndindex(*shape):
                    thing[pos] = values[pos]
        buffers_by_name[key] = thing

    # get JIT pipeline
    zones = twoel_gen.define_original_twoel_zone().split_recursive()
    wanted = zone_name.split(",")
    # Zones are read through `.name` and `zones.loops` everywhere, matching
    # the accesses the normal (non-error) path already relies on below.
    myzones = [
        zone for zone in zones.loops
        if zone_name == 'all' or zone.name in wanted
    ]
    if not myzones:
        if zone_name == 'list':
            print([z.name for z in zones.loops])
        else:
            print("no zone %s found" % zone_name)
        # Same effect as exit(1) without depending on the `site` helper.
        raise SystemExit(1)

    zones.loops = myzones
    gen = twoel_gen.Generate_twoel(loopnests=zones, **kwargs)
    gen.generate_twoel()
    p = gen.pipeline
    target = hl.Target(target_name)
    compiled_names = [z.name for z in myzones]
    print("compiling zones", compiled_names, "for target", target)
    p.compile_jit(target)

    # plug in the parameter values
    for param in gen.inputs.values():
        name = param.name()
        if name not in buffers_by_name:
            # A pipeline input called "<key>_in" is bound to the data
            # generated for "<key>".
            if name.endswith("_in") and name[:-3] in buffers_by_name:
                name = name[:-3]
            else:
                raise KeyError(name)
        param.set(buffers_by_name[name])

    # dry-run
    p.realize(N, N)

    print(itercount, "timed runs")
    if itercount <= 0:
        # when generating trace output, just doing the dry-run is enough.
        return 0.0, 0.0

    # benchmark it
    walltime = 0.0
    cputime = 0.0
    for _ in range(itercount):
        cpu_before = time.process_time()
        # perf_counter is monotonic and high-resolution, unlike time.time().
        wall_before = time.perf_counter()
        p.realize(N, N)
        cpu_after = time.process_time()
        wall_after = time.perf_counter()
        walltime += wall_after - wall_before
        cputime += cpu_after - cpu_before

    print("walltime: %.3f" % walltime)
    print("cputime: %.3f" % cputime)
    walltime /= itercount
    cputime /= itercount
    print("walltime per iter: %.3e" % walltime)
    print("cputime per iter: %.3e" % cputime)
    throughput = N * N * N * N / walltime
    print("throughput: %.3e g() calls per second (roughly)" % throughput)
    return walltime, cputime
def test_target():
    """Exercise the hl.Target bindings.

    Covers construction, string round-tripping and validation, feature
    manipulation, and the capability queries (vector size, GPU, large
    buffers, device API and type support).
    """
    OS = hl.TargetOS
    Arch = hl.TargetArch
    Feature = hl.TargetFeature

    # Target("") should be exactly like get_host_target().
    host = hl.get_host_target()
    from_empty_string = hl.Target("")
    assert host == from_empty_string, "Default ctor failure"
    assert host.supported()

    is_valid = hl.Target.validate_target_string

    # to_string roundtripping
    unknown = hl.Target()
    text = unknown.to_string()
    assert text == "arch_unknown-0-os_unknown"

    # Note, this should *not* validate, since validate_target_string
    # now returns false if any of arch-bits-os are undefined
    assert not is_valid(text)

    # Don't attempt to roundtrip this: trying to create
    # a Target with unknown portions will now assert-fail.
    #
    # t2 = hl.Target(ts)
    # assert t2 == t1

    # repr() and str()
    assert str(unknown) == "arch_unknown-0-os_unknown"
    assert repr(unknown) == "<halide.Target arch_unknown-0-os_unknown>"

    assert unknown.os == OS.OSUnknown
    assert unknown.arch == Arch.ArchUnknown
    assert unknown.bits == 0

    # Full specification round-trip:
    text = hl.Target(OS.Linux, Arch.X86, 32, [Feature.SSE41]).to_string()
    assert text == "x86-32-linux-sse41"
    assert is_valid(text)

    # Full specification (without features) round-trip:
    text = hl.Target(OS.Linux, Arch.X86, 32).to_string()
    assert text == "x86-32-linux"
    assert is_valid(text)

    # Full specification round-trip, crazy features
    crazy_features = [
        Feature.JIT,
        Feature.SSE41,
        Feature.AVX,
        Feature.AVX2,
        Feature.CUDA,
        Feature.OpenCL,
        Feature.OpenGL,
        Feature.OpenGLCompute,
        Feature.Debug,
    ]
    text = hl.Target(OS.Android, Arch.ARM, 32, crazy_features).to_string()
    assert text == "arm-32-android-avx-avx2-cuda-debug-jit-opencl-opengl-openglcompute-sse41"
    assert is_valid(text)

    # Expected failures:
    expected_invalid = (
        "host-unknowntoken",
        "x86-23",
        # bits == 0 is allowed only if arch_unknown and os_unknown are
        # specified, and no features are set
        "x86-0",
        "0-arch_unknown-os_unknown-sse41",
        # "host" is only supported as the first token
        "opencl-host",
    )
    for text in expected_invalid:
        assert not is_valid(text)

    # set_feature
    target = hl.Target(OS.Linux, Arch.X86, 32, [Feature.SSE41])
    assert target.has_feature(Feature.SSE41)
    assert not target.has_feature(Feature.AVX)
    target.set_feature(Feature.AVX)
    target.set_feature(Feature.SSE41, False)
    assert target.has_feature(Feature.AVX)
    assert not target.has_feature(Feature.SSE41)

    # set_features
    target = hl.Target(OS.Linux, Arch.X86, 32, [Feature.SSE41])
    assert target.has_feature(Feature.SSE41)
    assert not target.has_feature(Feature.AVX)
    target.set_features([Feature.SSE41], False)
    target.set_features([Feature.AVX, Feature.AVX2], True)
    assert target.has_feature(Feature.AVX)
    assert target.has_feature(Feature.AVX2)
    assert not target.has_feature(Feature.SSE41)

    # with_feature
    base = hl.Target(OS.Linux, Arch.X86, 32, [Feature.SSE41])
    extended = base.with_feature(Feature.NoAsserts)
    extended = extended.with_feature(Feature.NoBoundsQuery)
    assert extended.to_string() == "x86-32-linux-no_asserts-no_bounds_query-sse41"

    # without_feature
    base = hl.Target(OS.Linux, Arch.X86, 32,
                     [Feature.SSE41, Feature.NoAsserts])
    # Note that NoBoundsQuery wasn't set here, so 'without' is a no-op
    reduced = base.without_feature(Feature.NoAsserts)
    reduced = reduced.without_feature(Feature.NoBoundsQuery)
    assert reduced.to_string() == "x86-32-linux-sse41"

    # natural_vector_size
    # SSE4.1 is 16 bytes wide
    sse41 = hl.Target(OS.Linux, Arch.X86, 32, [Feature.SSE41])
    assert sse41.natural_vector_size(hl.UInt(8)) == 16
    assert sse41.natural_vector_size(hl.Int(16)) == 8
    assert sse41.natural_vector_size(hl.UInt(32)) == 4
    assert sse41.natural_vector_size(hl.Float(32)) == 4

    # has_gpu_feature
    with_gpu = hl.Target(OS.Linux, Arch.X86, 32, [Feature.OpenCL])
    without_gpu = hl.Target(OS.Linux, Arch.X86, 32, [])
    assert with_gpu.has_gpu_feature()
    assert not without_gpu.has_gpu_feature()

    # has_large_buffers & maximum_buffer_size
    large = hl.Target(OS.Linux, Arch.X86, 64, [Feature.LargeBuffers])
    regular = hl.Target(OS.Linux, Arch.X86, 64, [])
    assert large.has_large_buffers()
    assert large.maximum_buffer_size() == 9223372036854775807
    assert not regular.has_large_buffers()
    assert regular.maximum_buffer_size() == 2147483647

    # supports_device_api
    with_cuda = hl.Target(OS.Linux, Arch.X86, 64, [Feature.CUDA])
    without_cuda = hl.Target(OS.Linux, Arch.X86, 64)
    assert with_cuda.supports_device_api(hl.DeviceAPI.CUDA)
    assert not without_cuda.supports_device_api(hl.DeviceAPI.CUDA)

    # supports_type (deprecated version)
    with_metal = hl.Target(OS.OSX, Arch.X86, 64, [Feature.Metal])
    without_metal = hl.Target(OS.OSX, Arch.X86, 64)
    assert not with_metal.supports_type(hl.Float(64))
    assert without_metal.supports_type(hl.Float(64))

    # supports_type (preferred version)
    with_metal = hl.Target(OS.OSX, Arch.X86, 64, [Feature.Metal])
    without_metal = hl.Target(OS.OSX, Arch.X86, 64)
    assert not with_metal.supports_type(hl.Float(64), hl.DeviceAPI.Metal)
    assert not without_metal.supports_type(hl.Float(64), hl.DeviceAPI.Metal)

    # target_feature_for_device_api
    opencl_feature = hl.target_feature_for_device_api(hl.DeviceAPI.OpenCL)
    assert opencl_feature == hl.TargetFeature.OpenCL

    # with_feature with non-convertible lists
    try:
        _ = hl.Target(OS.Linux, Arch.X86, 32, ["this is a string"])
    except TypeError as err:
        assert "incompatible constructor arguments" in str(err)
    else:
        assert False, 'Did not see expected exception!'
def _read_object_header(path, size, label):
    """Return the first ``size`` bytes of ``path``, or None if it is unreadable.

    When the file cannot be opened or read, prints
    "<label> object file not generated" and returns None.
    """
    try:
        # `with` closes the file on both the success and the error path.
        with open(path, "rb") as obj_file:
            return obj_file.read(size)
    except OSError:
        print("%s object file not generated" % label)
        return None


def main():
    """Cross-compile the lesson 10 pipeline and sanity-check the object files.

    Compiles ``brighter`` for the host, 32-bit ARM Android, 64-bit x86
    Windows and 32-bit ARM iOS, then compares the first bytes of each
    cross-compiled object file against the expected magic numbers.

    Returns:
        0 on success, -1 when an expected object file cannot be read.

    Raises:
        Exception: When an object file does not start with the expected
            header bytes.
    """
    # We'll define the simple one-stage pipeline that we used in lesson 10.
    brighter = hl.Func("brighter")
    x, y = hl.Var("x"), hl.Var("y")

    # Declare the arguments.
    offset = hl.Param(hl.UInt(8))
    input_image = hl.ImageParam(hl.UInt(8), 2)
    args = [input_image, offset]

    # Define the hl.Func.
    brighter[x, y] = input_image[x, y] + offset

    # Schedule it.
    brighter.vectorize(x, 16).parallel(y)

    # The following line is what we did in lesson 10. It compiles an
    # object file suitable for the system that you're running this
    # program on.  For example, if you compile and run this file on
    # 64-bit linux on an x86 cpu with sse4.1, then the generated code
    # will be suitable for 64-bit linux on x86 with sse4.1.
    brighter.compile_to_file("lesson_11_host", args, "lesson_11_host")

    # We can also compile object files suitable for other cpus and
    # operating systems. You do this with an optional third argument
    # to compile_to_file which specifies the target to compile for.

    create_android = True
    create_windows = True
    create_ios = True

    if create_android:
        # Let's use this to compile a 32-bit arm android version of this code:
        target = hl.Target()
        target.os = hl.TargetOS.Android  # The operating system
        target.arch = hl.TargetArch.ARM  # The CPU architecture
        target.bits = 32  # The bit-width of the architecture
        arm_features = []  # A list of features to set
        target.set_features(arm_features)
        # Pass the target as the last argument.
        brighter.compile_to_file("lesson_11_arm_32_android", args,
                                 "lesson_11_arm_32_android", target)

    if create_windows:
        # And now a Windows object file for 64-bit x86 with AVX and SSE 4.1:
        target = hl.Target()
        target.os = hl.TargetOS.Windows
        target.arch = hl.TargetArch.X86
        target.bits = 64
        target.set_features([hl.TargetFeature.AVX, hl.TargetFeature.SSE41])
        brighter.compile_to_file("lesson_11_x86_64_windows", args,
                                 "lesson_11_x86_64_windows", target)

    if create_ios:
        # And finally an iOS mach-o object file for one of Apple's 32-bit
        # ARM processors - the A6. It's used in the iPhone 5. The A6 uses
        # a slightly modified ARM architecture called ARMv7s. We specify
        # this using the target features field.  Support for Apple's
        # 64-bit ARM processors is very new in llvm, and still somewhat
        # flaky.
        target = hl.Target()
        target.os = hl.TargetOS.IOS
        target.arch = hl.TargetArch.ARM
        target.bits = 32
        target.set_features([hl.TargetFeature.ARMv7s])
        brighter.compile_to_file("lesson_11_arm_32_ios", args,
                                 "lesson_11_arm_32_ios", target)

    # Now let's check these files are what they claim, by examining
    # their first few bytes.

    if create_android:
        # 32-arm android object files start with the magic bytes:
        arm_32_android_magic = [
            0x7f, ord('E'), ord('L'), ord('F'),  # ELF format
            1,  # 32-bit
            1,  # 2's complement little-endian
            1,  # Current version of elf
        ]
        header_bytes = _read_object_header(
            "lesson_11_arm_32_android.o", len(arm_32_android_magic), "Android")
        if header_bytes is None:
            return -1
        # A truncated file simply compares unequal instead of raising a
        # struct.error.
        header = list(header_bytes)
        if header != arm_32_android_magic:
            # Show which leading bytes matched, to help diagnose the mismatch.
            print([got == want
                   for got, want in zip(header, arm_32_android_magic)])
            raise Exception(
                "Unexpected header bytes in 32-bit arm object file.")

    if create_windows:
        # 64-bit windows object files start with the magic 16-bit value 0x8664
        # (presumably referring to x86-64)
        win_64_magic = [0x64, 0x86]
        header_bytes = _read_object_header(
            "lesson_11_x86_64_windows.obj", len(win_64_magic), "Windows")
        if header_bytes is None:
            return -1
        if list(header_bytes) != win_64_magic:
            raise Exception(
                "Unexpected header bytes in 64-bit windows object file.")

    if create_ios:
        # 32-bit arm iOS mach-o files start with the following four 32-bit
        # words:
        arm_32_ios_magic = [
            0xfeedface,  # Mach-o magic bytes
            12,  # CPU type is ARM
            11,  # CPU subtype is ARMv7s
            1,  # It's a relocatable object file.
        ]
        header_size = 4 * len(arm_32_ios_magic)
        header_bytes = _read_object_header(
            "lesson_11_arm_32_ios.o", header_size, "ios")
        if header_bytes is None:
            return -1
        # Decode explicitly as little-endian so the check does not depend on
        # the byte order of the machine running this script; a short read is
        # reported as a mismatch rather than a struct.error.
        if (len(header_bytes) != header_size
                or list(unpack("<4I", header_bytes)) != arm_32_ios_magic):
            raise Exception(
                "Unexpected header bytes in 32-bit arm ios object file.")

    # It looks like the object files we produced are plausible for
    # those targets. We'll count that as a success for the purposes
    # of this tutorial. For a real application you'd then need to
    # figure out how to integrate Halide into your cross-compilation
    # toolchain. There are several small examples of this in the
    # Halide repository under the apps folder. See HelloAndroid and
    # HelloiOS here:
    # https://github.com/halide/Halide/tree/master/apps/
    print("Success!")
    return 0