def desaturate_noise(input, width, height):
    """Build a Func that blends channels 1 and 2 of ``input`` with a blurred copy.

    For channels 1 and 2, a value is replaced by ``0.7 * blur + 0.3 * input``
    when the ratio ``|blur| / |input|`` is below a fixed factor and both
    magnitudes are below a fixed threshold; otherwise it is passed through.
    All other channels are copied unchanged.

    Args:
        input: Halide Func/image indexed as ``[x, y, c]``.
        width: Extent along x used for the mirror boundary condition.
        height: Extent along y used for the mirror boundary condition.

    Returns:
        The scheduled ``hl.Func`` named ``desaturate_noise_output``.
    """
    print(' desaturate_noise')

    output = hl.Func("desaturate_noise_output")
    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")

    # The blur is applied twice on a mirrored copy of the input.
    input_mirror = hl.BoundaryConditions.mirror_image(input, [(0, width), (0, height)])
    once_blurred = gauss_15x15(input_mirror, "desaturate_noise_blur1")
    blur = gauss_15x15(once_blurred, "desaturate_noise_blur_2")

    factor = 1.4
    threshold = 25000

    # Pure definition: pass everything through, then override channels 1 and 2.
    output[x, y, c] = input[x, y, c]
    for channel in (1, 2):
        smooth = blur[x, y, channel]
        raw = input[x, y, channel]
        should_blend = ((hl.abs(smooth) / hl.abs(raw) < factor)
                        & (hl.abs(raw) < threshold)
                        & (hl.abs(smooth) < threshold))
        output[x, y, channel] = hl.select(should_blend,
                                          0.7 * smooth + 0.3 * raw,
                                          raw)

    output.compute_root().parallel(y).vectorize(x, 16)
    return output
def expand_layer(x, y, c, img):
    """Return a Func that is a 2x-expanded version of ``img``.

    Samples of ``img`` are placed at even ``(x, y)`` positions with zeros
    elsewhere, the result is passed through ``gaussian``, and the filtered
    values are multiplied by 4.

    Args:
        x, y, c: Halide Vars used to index the Funcs.
        img: Func indexed as ``[x, y, c]``.

    Returns:
        The Func produced by ``mkfunc("expand", img)`` holding the result.
    """
    # Only positions where both coordinates are even carry a source sample.
    on_even_site = (x % 2 == 0) & (y % 2 == 0)

    zero_stuffed = hl.Func('expanded')
    zero_stuffed[x, y, c] = hl.select(on_even_site, img[x // 2, y // 2, c], 0.0)

    smoothed = gaussian(x, y, c, zero_stuffed)

    result = mkfunc("expand", img)
    result[x, y, c] = smoothed[x, y, c] * 4.0
    return result
def test_basics3():
    """Construct bilateral-grid style expressions and print FuncRef arithmetic.

    Nothing is realized here; the function builds the expressions, prints
    several intermediate objects and applies ``+=`` to a histogram reference.
    """
    image = hl.ImageParam(hl.Float(32), 3, 'input')
    # A value is needed when no executable is being generated.
    r_sigma = hl.Param(hl.Float(32), 'r_sigma', 0.1)
    # In the C++ version this is passed during code generation.
    s_sigma = 8

    x, y, z, c = hl.Var('x'), hl.Var('y'), hl.Var('z'), hl.Var('c')

    # Boundary condition: clamp the coordinates and read channel 0.
    clamped = hl.Func('clamped')
    clamped[x, y] = image[hl.clamp(x, 0, image.width()-1),
                          hl.clamp(y, 0, image.height()-1),
                          0]

    # Construct the bilateral grid.
    r = hl.RDom(0, s_sigma, 0, s_sigma, 'r')
    sample = clamped[x * s_sigma + r.x - s_sigma//2,
                     y * s_sigma + r.y - s_sigma//2]
    val = hl.clamp(sample, 0.0, 1.0)
    # Alternative formulation: val * (1.0/r_sigma) + 0.5
    zi = hl.cast(hl.Int(32), (val / r_sigma) + 0.5)

    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0

    ss = hl.select(c == 0, val, 1.0)
    print("hl.select(c == 0, val, 1.0)", ss)

    left = histogram[x, y, zi, c]
    print("histogram[x, y, zi, c]", histogram[x, y, zi, c])
    print("histogram[x, y, zi, c]", left)

    left += 5
    print("histogram[x, y, zi, c] after += 5", left)

    left += ss
def test_basics3():
    """Construct bilateral-grid style expressions and apply ``+=`` to a FuncRef.

    Nothing is realized; the test passes if building the expressions and
    applying the two in-place additions raises no error.
    """
    image = hl.ImageParam(hl.Float(32), 3, 'input')
    # A value is needed when no executable is being generated.
    r_sigma = hl.Param(hl.Float(32), 'r_sigma', 0.1)
    # In the C++ version this is passed during code generation.
    s_sigma = 8

    x, y, z, c = hl.Var('x'), hl.Var('y'), hl.Var('z'), hl.Var('c')

    # Boundary condition: clamp the coordinates and read channel 0.
    clamped = hl.Func('clamped')
    clamped_x = hl.clamp(x, 0, image.width() - 1)
    clamped_y = hl.clamp(y, 0, image.height() - 1)
    clamped[x, y] = image[clamped_x, clamped_y, 0]

    # Construct the bilateral grid.
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    sample = clamped[x * s_sigma + r.x - s_sigma // 2,
                     y * s_sigma + r.y - s_sigma // 2]
    val = hl.clamp(sample, 0.0, 1.0)
    zi = hl.i32((val / r_sigma) + 0.5)

    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0

    ss = hl.select(c == 0, val, 1.0)
    left = histogram[x, y, zi, c]
    left += 5
    left += ss
def merge_temporal(images, alignment):
    """Build the Func that merges alternate frames into the reference frame.

    A per-tile weight is computed for each frame from the summed absolute
    difference between the reference tile (frame 0) and the aligned tile of
    that frame on a downsampled layer.  The output is the weighted sum of the
    aligned alternate frames plus the reference, divided by the total weight.

    Args:
        images: 3-D image stack; dimension 2 indexes the frames.
        alignment: Func indexed as ``[tx, ty, n]`` giving per-tile offsets.

    Returns:
        The scheduled ``hl.Func`` named ``merge_temporal_output``, indexed as
        ``[ix, iy, tx, ty]``.
    """
    weight = hl.Func("merge_temporal_weights")
    total_weight = hl.Func("merge_temporal_total_weights")
    output = hl.Func("merge_temporal_output")

    ix, iy = hl.Var('ix'), hl.Var('iy')
    tx, ty = hl.Var('tx'), hl.Var('ty')
    n = hl.Var('n')

    rdom0 = hl.RDom([(0, 16), (0, 16)])
    # Iterates over every frame except frame 0.
    rdom1 = hl.RDom([(1, images.dim(2).extent() - 1)])

    imgs_mirror = hl.BoundaryConditions.mirror_interior(
        images, [(0, images.width()), (0, images.height())])
    layer = box_down2(imgs_mirror, "merge_layer")

    # --- Stage 1: per-tile weights, measured on the downsampled layer. ---
    lower_bound = Point(MINIMUM_OFFSET, MINIMUM_OFFSET)
    upper_bound = Point(MAXIMUM_OFFSET, MAXIMUM_OFFSET)
    layer_offset = Point(alignment[tx, ty, n]).clamp(lower_bound, upper_bound)

    # Offsets are halved when indexing the downsampled layer.
    layer_al_x = idx_layer(tx, rdom0.x) + layer_offset.x / 2
    layer_al_y = idx_layer(ty, rdom0.y) + layer_offset.y / 2
    layer_ref = layer[idx_layer(tx, rdom0.x), idx_layer(ty, rdom0.y), 0]
    layer_alt = layer[layer_al_x, layer_al_y, n]

    factor = 8.0
    min_distance = 10
    max_distance = 300  # max L1 distance, otherwise the value is not used

    abs_diff = hl.abs(hl.cast(hl.Int(32), layer_ref) - hl.cast(hl.Int(32), layer_alt))
    distance = hl.sum(abs_diff) / 256
    normal_distance = hl.max(
        1, hl.cast(hl.Int(32), distance) / factor - min_distance / factor)

    # Weight for the alternate frame
    weight[tx, ty, n] = hl.select(
        normal_distance > (max_distance - min_distance),
        0.0,
        1.0 / normal_distance)

    # The "+ 1" is the reference frame's own weight.
    total_weight[tx, ty] = hl.sum(weight[tx, ty, rdom1]) + 1

    # --- Stage 2: weighted average on the full-resolution images. ---
    full_offset = Point(alignment[tx, ty, rdom1])
    full_al_x = idx_im(tx, ix) + full_offset.x
    full_al_y = idx_im(ty, iy) + full_offset.y
    full_ref = imgs_mirror[idx_im(tx, ix), idx_im(ty, iy), 0]
    full_alt = imgs_mirror[full_al_x, full_al_y, rdom1]

    # Sum all values according to their weight, and divide by total weight
    # to obtain the average.
    weighted_alts = hl.sum(weight[tx, ty, rdom1] * full_alt / total_weight[tx, ty])
    output[ix, iy, tx, ty] = weighted_alts + full_ref / total_weight[tx, ty]

    weight.compute_root().parallel(ty).vectorize(tx, 16)
    total_weight.compute_root().parallel(ty).vectorize(tx, 16)
    output.compute_root().parallel(ty).vectorize(ix, 32)

    return output
def test_minmax():
    """Check ``hl.min`` / ``hl.max`` with mixed argument counts and types.

    The Func selects a different min/max expression depending on ``x`` and the
    realized values for ``x`` in ``0..4`` are compared against the expected
    results.
    """
    x = hl.Var()
    f = hl.Func()
    # The cast helpers are taken from the ``hl`` namespace so this test does
    # not rely on bare ``i32`` / ``f32`` names being in scope.
    f[x] = hl.select(
        x == 0, hl.min(x, 1),
        (x == 2) | (x == 4), hl.i32(hl.min(hl.f32(x), 3.2, x * 2.1)),
        x == 3, hl.max(x, x * 3, 1, x * 4),
        x)

    b = f.realize(5)
    assert b[0] == 0            # min(0, 1)
    assert b[1] == 1, b[1]      # fall-through branch: x itself
    assert b[2] == 2            # i32(min(2.0, 3.2, 4.2))
    assert b[3] == 12           # max(3, 9, 1, 12)
    assert b[4] == 3            # i32(min(4.0, 3.2, 8.4))
def shift_bayer_to_rggb(input, cfa_pattern):
    """Return a Func that reads ``input`` shifted according to ``cfa_pattern``.

    The pattern code selects the shift: 1 -> no shift, 2 -> one pixel in x,
    4 -> one pixel in y, 3 -> one pixel in both.  Any other code yields 0.

    Args:
        input: Halide Func/image indexed as ``[x, y]``.
        cfa_pattern: Pattern code; it is converted with ``hl.u16``.

    Returns:
        The ``hl.Func`` named ``rggb_input``.
    """
    print(f'cfa_pattern: {cfa_pattern}')

    output = hl.Func("rggb_input")
    x, y = hl.Var("x"), hl.Var("y")
    cfa = hl.u16(cfa_pattern)

    # (pattern code, x index, y index), tested in this order.
    shifted_reads = (
        (1, x, y),
        (2, x + 1, y),
        (4, x, y + 1),
        (3, x + 1, y + 1),
    )
    select_args = []
    for code, src_x, src_y in shifted_reads:
        select_args.append(cfa == hl.u16(code))
        select_args.append(input[src_x, src_y])

    output[x, y] = hl.select(*select_args, 0)
    return output
def test_multipass_constraints():
    """Check bounds inference when input and output constraints depend on each other."""
    input = hl.ImageParam(hl.Float(32), 2, "input")
    f = hl.Func("f")
    x, y = hl.Var("x"), hl.Var("y")

    f[x, y] = input[x+1, y+1] + input[x-1, y-1]
    f[x, y] += 3.0
    f.update().vectorize(x, 4)

    o = f.output_buffer()

    # Now make some hard-to-resolve constraints
    input.dim(0).set_bounds(
        min=input.dim(1).min() - 5,
        extent=input.dim(1).extent() + o.dim(0).extent(),
    )
    out_extent = hl.select(o.dim(0).extent() < 22,
                           o.dim(0).extent() + 1,
                           o.dim(0).extent())
    o.dim(0).set_bounds(min=0, extent=out_extent)

    # Make a bounds query buffer
    query_buf = hl.Buffer.make_bounds_query(type=hl.Float(32), sizes=[7, 8])
    query_buf.set_min([2, 2])

    f.infer_input_bounds(query_buf)

    # (min, extent) of dimension 0 followed by (min, extent) of dimension 1.
    in_bounds = (
        input.get().dim(0).min(), input.get().dim(0).extent(),
        input.get().dim(1).min(), input.get().dim(1).extent(),
    )
    out_bounds = (
        query_buf.dim(0).min(), query_buf.dim(0).extent(),
        query_buf.dim(1).min(), query_buf.dim(1).extent(),
    )

    if in_bounds != (-4, 34, 1, 10) or out_bounds != (0, 24, 2, 8):
        print("Constraints not correctly satisfied:\n",
              "in:", *in_bounds,
              "out:", *out_bounds)
        assert False
def gamma_inverse(input):
    """Return a Func applying a piecewise curve to ``input``, stored as u16.

    Values below the cutoff are scaled linearly by ``gamma_toe``; all others
    go through ``pow(value / 65535 + gamma_con, gamma_pow) * gamma_fac``.
    Both 2-D (``[x, y]``) and higher-dimensional (``[x, y, c]``) inputs are
    supported.

    Args:
        input: Halide Func/image providing ``dimensions()``.

    Returns:
        The scheduled ``hl.Func`` named ``gamma_inverse_output``.
    """
    output = hl.Func("gamma_inverse_output")
    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")

    cutoff = 2575
    gamma_toe = 0.0774
    gamma_pow = 2.4
    gamma_fac = 57632.49226
    gamma_con = 0.055

    # The same formula is used for both layouts; only the index tuple differs.
    coords = (x, y) if input.dimensions() == 2 else (x, y, c)
    value = input[coords]

    toe_branch = gamma_toe * value
    power_branch = hl.pow(hl.f32(value) / 65535 + gamma_con, gamma_pow) * gamma_fac
    output[coords] = hl.u16(hl.select(value < cutoff, toe_branch, power_branch))

    output.compute_root().parallel(y).vectorize(x, 16)
    return output
def gamma_correct(input):
    """Return a Func applying a piecewise curve to ``input``, stored as u16.

    Values below the cutoff are scaled linearly by ``gamma_toe``; all others
    become ``gamma_fac * pow(value, gamma_pow) + gamma_con``.  Both 2-D
    (``[x, y]``) and higher-dimensional (``[x, y, c]``) inputs are supported.

    Args:
        input: Halide Func/image providing ``dimensions()``.

    Returns:
        The scheduled ``hl.Func`` named ``gamma_correct_output``.
    """
    output = hl.Func("gamma_correct_output")
    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")

    cutoff = 200
    gamma_toe = 12.92
    gamma_pow = 0.416667
    gamma_fac = 680.552897
    gamma_con = -3604.425

    # The same formula is used for both layouts; only the index tuple differs.
    coords = (x, y) if input.dimensions() == 2 else (x, y, c)
    value = input[coords]

    toe_branch = gamma_toe * value
    power_branch = gamma_fac * hl.pow(value, gamma_pow) + gamma_con
    output[coords] = hl.u16(hl.select(value < cutoff, toe_branch, power_branch))

    output.compute_root().parallel(y).vectorize(x, 16)
    return output
def test_multipass_constraints():
    """Check bounds inference when input and output constraints depend on each other."""

    def read_bounds(buf):
        """Return (min, extent) of dimension 0 then of dimension 1 of ``buf``."""
        return (buf.dim(0).min(), buf.dim(0).extent(),
                buf.dim(1).min(), buf.dim(1).extent())

    input = hl.ImageParam(hl.Float(32), 2, "input")
    f = hl.Func("f")
    x = hl.Var("x")
    y = hl.Var("y")

    f[x, y] = input[x + 1, y + 1] + input[x - 1, y - 1]
    f[x, y] += 3.0
    f.update().vectorize(x, 4)

    o = f.output_buffer()

    # Now make some hard-to-resolve constraints
    in_min = input.dim(1).min() - 5
    in_extent = input.dim(1).extent() + o.dim(0).extent()
    input.dim(0).set_bounds(min=in_min, extent=in_extent)

    o.dim(0).set_bounds(
        min=0,
        extent=hl.select(o.dim(0).extent() < 22,
                         o.dim(0).extent() + 1,
                         o.dim(0).extent()))

    # Make a bounds query buffer
    query_buf = hl.Buffer.make_bounds_query(type=hl.Float(32), sizes=[7, 8])
    query_buf.set_min([2, 2])

    f.infer_input_bounds(query_buf)

    inferred_in = read_bounds(input.get())
    inferred_out = read_bounds(query_buf)
    satisfied = inferred_in == (-4, 34, 1, 10) and inferred_out == (0, 24, 2, 8)

    if not satisfied:
        print("Constraints not correctly satisfied:\n",
              "in:", *inferred_in,
              "out:", *inferred_out)
        assert False
def test_select():
    """Check multi-way ``hl.select``: the first matching condition wins."""
    x = hl.Var()
    f = hl.Func()

    # Alternating (condition, value) arguments followed by the default value.
    select_args = (
        x == 0, 31,
        x == 2, (x * 24),
        x == 2, 999,  # should be ignored: first condition wins
        x,
    )
    f[x] = hl.select(*select_args)

    b = f.realize(4)
    for position, expected in enumerate((31, 1, 48, 3)):
        assert b[position] == expected
def main():
    """Run the tuple tutorial: tuple-valued Funcs, tuple reductions, user types.

    The function defines several Funcs, realizes some of them, checks the
    results with assertions and finally prints a Mandelbrot set as ASCII art.

    Returns:
        0 once every check has passed and the output has been printed.
    """
    # So far Funcs (such as the one below) have evaluated to a single
    # scalar value for each point in their domain.
    single_valued = hl.Func()
    x, y = hl.Var("x"), hl.Var("y")
    single_valued[x, y] = x + y

    # One way to write a hl.Func that returns a collection of values is
    # to add an additional dimension which indexes that
    # collection. This is how we typically deal with color. For
    # example, the hl.Func below represents a collection of three values
    # for every x, y coordinate indexed by c.
    color_image = hl.Func()
    c = hl.Var("c")
    color_image[x, y, c] = hl.select(
        c == 0, 245,  # Red value
        c == 1, 42,   # Green value
        132)          # Blue value

    # Since this pattern appears quite often, Halide provides
    # syntactic sugar to write the code above as the following,
    # using the "mux" function.
    # color_image[x, y, c] = hl.mux(c, [245, 42, 132]);

    # This method is often convenient because it makes it easy to
    # operate on this hl.Func in a way that treats each item in the
    # collection equally:
    brighter = hl.Func()
    brighter[x, y, c] = color_image[x, y, c] + 10

    # However this method is also inconvenient for three reasons.
    #
    # 1) Funcs are defined over an infinite domain, so users of this
    # hl.Func can for example access color_image(x, y, -17), which is
    # not a meaningful value and is probably indicative of a bug.
    #
    # 2) It requires a hl.select, which can impact performance if not
    # bounded and unrolled:
    # brighter.bound(c, 0, 3).unroll(c)
    #
    # 3) With this method, all values in the collection must have the
    # same type. While the above two issues are merely inconvenient,
    # this one is a hard limitation that makes it impossible to
    # express certain things in this way.

    # It is also possible to represent a collection of values as a
    # collection of Funcs:
    func_array = [hl.Func() for i in range(3)]
    func_array[0][x, y] = x + y
    func_array[1][x, y] = hl.sin(x)
    func_array[2][x, y] = hl.cos(y)

    # This method avoids the three problems above, but introduces a
    # new annoyance. Because these are separate Funcs, it is
    # difficult to schedule them so that they are all computed
    # together inside a single loop over x, y.

    # A third alternative is to define a hl.Func as evaluating to a
    # Tuple instead of an hl.Expr. A Tuple is a fixed-size collection of
    # Exprs which may have different type. The following function
    # evaluates to an integer value (x+y), and a floating point value
    # (hl.sin(x*y)).
    multi_valued = hl.Func("multi_valued")
    multi_valued[x, y] = (x + y, hl.sin(x * y))

    # Realizing a tuple-valued hl.Func returns a collection of
    # Buffers. We call this a Realization. It's equivalent to a
    # std::vector of hl.Buffer/Image objects:
    if True:
        im1, im2 = multi_valued.realize([80, 60])
        assert im1.type() == hl.Int(32)
        assert im2.type() == hl.Float(32)
        assert im1[30, 40] == 30 + 40
        assert np.isclose(im2[30, 40], math.sin(30 * 40))

    # You can also pass a tuple of pre-allocated buffers to realize()
    # rather than having new ones created. (The Buffers must have the correct
    # types and have identical sizes.)
    if True:
        im1, im2 = hl.Buffer(hl.Int(32), [80, 60]), hl.Buffer(hl.Float(32), [80, 60])
        multi_valued.realize((im1, im2))
        assert im1[30, 40] == 30 + 40
        assert np.isclose(im2[30, 40], math.sin(30 * 40))

    # All Tuple elements are evaluated together over the same domain
    # in the same loop nest, but stored in distinct allocations. The
    # equivalent C++ code to the above is:
    if True:
        multi_valued_0 = np.empty((80 * 60), dtype=np.int32)
        # The second tuple element is floating point, so its storage must be
        # a float array; an int32 array would truncate every sine value.
        multi_valued_1 = np.empty((80 * 60), dtype=np.float32)

        for yy in range(80):
            for xx in range(60):
                multi_valued_0[xx + 60 * yy] = xx + yy
                multi_valued_1[xx + 60 * yy] = math.sin(xx * yy)

    # When compiling ahead-of-time, a Tuple-valued hl.Func evaluates
    # into multiple distinct output halide_buffer_t structs. These appear in
    # order at the end of the function signature:
    # int multi_valued(...input buffers and params..., halide_buffer_t
    # *output_1, halide_buffer_t *output_2)

    # In C++ a Tuple is built with the Tuple constructor or an
    # initializer list; in Python a plain tuple of Exprs is enough:
    multi_valued_2 = hl.Func("multi_valued_2")
    multi_valued_2[x, y] = (x + y, hl.sin(x * y))

    # Calls to a multi-valued hl.Func cannot be treated as Exprs. The
    # following is an error:
    # hl.Func consumer
    # consumer[x, y] = multi_valued_2[x, y] + 10

    # Instead you must index the returned object with square brackets
    # to retrieve the individual Exprs:
    integer_part = multi_valued_2[x, y][0]
    floating_part = multi_valued_2[x, y][1]
    assert type(integer_part) is hl.FuncTupleElementRef
    assert type(floating_part) is hl.FuncTupleElementRef

    consumer = hl.Func()
    consumer[x, y] = (integer_part + 10, floating_part + 10.0)

    # Tuple reductions.
    if True:
        # Tuples are particularly useful in reductions, as they allow
        # the reduction to maintain complex state as it walks along
        # its domain. The simplest example is an argmax.

        # First we create an Image to take the argmax over.
        input_func = hl.Func()
        input_func[x] = hl.sin(x)
        input = input_func.realize([100])
        assert input.type() == hl.Float(32)

        # Then we define a 2-valued Tuple which tracks the maximum value
        # and its index.
        arg_max = hl.Func()

        # Pure definition.
        # (using [()] for zero-dimensional Funcs is a convention of this python interface)
        arg_max[()] = (0, input[0])

        # Update definition.
        r = hl.RDom([(1, 99)])
        old_index = arg_max[()][0]
        old_max = arg_max[()][1]
        # Move the index only when the new sample beats the running maximum;
        # on ties the earlier index is kept.
        new_index = hl.select(old_max < input[r], r, old_index)
        new_max = hl.max(input[r], old_max)
        arg_max[()] = (new_index, new_max)

        # The equivalent C++ is:
        arg_max_0 = 0
        arg_max_1 = float(input[0])
        for ri in range(1, 100):
            old_index = arg_max_0
            old_max = arg_max_1
            new_index = ri if (old_max < input[ri]) else old_index
            new_max = max(input[ri], old_max)
            # In a tuple update definition, all loads and computation
            # are done before any stores, so that all Tuple elements
            # are updated atomically with respect to recursive calls
            # to the same hl.Func.
            arg_max_0 = new_index
            arg_max_1 = new_max

        # Let's verify that the Halide and C++ found the same maximum
        # value and index.
        if True:
            r0, r1 = arg_max.realize()

            assert r0.type() == hl.Int(32)
            assert r1.type() == hl.Float(32)
            assert arg_max_0 == r0[()]
            assert np.isclose(arg_max_1, r1[()])

        # Halide provides hl.argmax and hl.argmin as built-in reductions
        # similar to sum, product, maximum, and minimum. They return
        # a Tuple consisting of the point in the reduction domain
        # corresponding to that value, and the value itself. In the
        # case of ties they return the first value found. We'll use
        # one of these in the following section.

    # Tuples for user-defined types.
    if True:
        # Tuples can also be a convenient way to represent compound
        # objects such as complex numbers. Defining an object that
        # can be converted to and from a Tuple is one way to extend
        # Halide's type system with user-defined types.
        class Complex:
            """Complex number whose parts are Halide expressions."""

            def __init__(self, r, i=None):
                if type(r) is float and type(i) is float:
                    # Two Python floats: wrap them as Exprs.
                    self.real = hl.Expr(r)
                    self.imag = hl.Expr(i)
                elif i is not None:
                    # Two separate components given as-is.
                    self.real = r
                    self.imag = i
                else:
                    # A single two-element object: (real, imag).
                    self.real = r[0]
                    self.imag = r[1]

            def as_tuple(self):
                "Convert to a Tuple"
                return (self.real, self.imag)

            def __add__(self, other):
                "Complex addition"
                return Complex(self.real + other.real, self.imag + other.imag)

            def __mul__(self, other):
                "Complex multiplication"
                return Complex(self.real * other.real - self.imag * other.imag,
                               self.real * other.imag + self.imag * other.real)

            def __getitem__(self, idx):
                return (self.real, self.imag)[idx]

            def __len__(self):
                return 2

            def magnitude(self):
                "Squared complex magnitude (no square root is taken)"
                return (self.real * self.real) + (self.imag * self.imag)

            # Other complex operators would go here. The above are
            # sufficient for this example.

        # Let's use the Complex struct to compute a Mandelbrot set.
        mandelbrot = hl.Func()

        # The initial complex value corresponding to an x, y coordinate
        # in our hl.Func.
        initial = Complex(x / 15.0 - 2.5, y / 6.0 - 2.0)

        # Pure definition.
        t = hl.Var("t")
        mandelbrot[x, y, t] = Complex(0.0, 0.0)

        # We'll use an update definition to take 12 steps.
        r = hl.RDom([(1, 12)])
        current = Complex(mandelbrot[x, y, r - 1])

        # The following line uses the complex multiplication and
        # addition we defined above.
        mandelbrot[x, y, r] = (Complex(current * current) + initial)

        # We'll use another tuple reduction to compute the iteration
        # number where the value first escapes a circle of radius 4
        # (the squared magnitude is compared against 16).
        # This can be expressed as an hl.argmin of a boolean - we want
        # the index of the first time the given boolean expression is
        # false (we consider false to be less than true). The argmax
        # would return the index of the first time the expression is
        # true.
        escape_condition = Complex(mandelbrot[x, y, r]).magnitude() < 16.0
        first_escape = hl.argmin(escape_condition)
        assert type(first_escape) is tuple
        # We only want the index, not the value, but hl.argmin returns
        # both, so we'll index the hl.argmin Tuple expression using
        # square brackets to get the hl.Expr representing the index.
        escape = hl.Func()
        escape[x, y] = first_escape[0]

        # Realize the pipeline and print the result as ascii art.
        result = escape.realize([61, 25])
        assert result.type() == hl.Int(32)
        code = " .:-~*={&%#@"
        for yy in range(result.height()):
            for xx in range(result.width()):
                index = result[xx, yy]
                if index < len(code):
                    print("%c" % code[index], end="")
                else:
                    pass  # is lesson 13 cpp version buggy ?
            print("")

    print("Success!")

    return 0
def demosaic(input, width, height):
    """Build a Func producing a 3-channel image from a single-plane mosaic.

    Four 5x5 integer kernels are convolved with the mirrored input; the
    output picks one of the four convolution results or the raw input value
    depending on the channel ``c`` and the parity of ``x`` and ``y``.

    Args:
        input: Halide Func/image indexed as ``[x, y]``.
        width: Extent along x used for the mirror boundary condition.
        height: Extent along y used for the mirror boundary condition.

    Returns:
        The scheduled ``hl.Func`` named ``demosaic_output``, indexed ``[x, y, c]``.
    """
    print(f'width: {width}, height: {height}')

    def make_kernel(name, taps):
        """Return a 5x5 Int(32) buffer with origin (-2, -2), zero except ``taps``."""
        kernel = hl.Buffer(hl.Int(32), [5, 5], name)
        kernel.translate([-2, -2])
        kernel.fill(0)
        for (kx, ky), tap in taps.items():
            kernel[kx, ky] = tap
        return kernel

    # Non-zero taps of each kernel, keyed by (x, y) offset from the centre.
    f0 = make_kernel("demosaic_f0", {
        (0, -2): -1, (0, -1): 2,
        (-2, 0): -1, (-1, 0): 2, (0, 0): 4, (1, 0): 2, (2, 0): -1,
        (0, 1): 2, (0, 2): -1,
    })
    f1 = make_kernel("demosaic_f1", {
        (0, -2): 1,
        (-1, -1): -2, (1, -1): -2,
        (-2, 0): -2, (-1, 0): 8, (0, 0): 10, (1, 0): 8, (2, 0): -2,
        (-1, 1): -2, (1, 1): -2,
        (0, 2): 1,
    })
    f2 = make_kernel("demosaic_f2", {
        (0, -2): -2,
        (-1, -1): -2, (0, -1): 8, (1, -1): -2,
        (-2, 0): 1, (0, 0): 10, (2, 0): 1,
        (-1, 1): -2, (0, 1): 8, (1, 1): -2,
        (0, 2): -2,
    })
    f3 = make_kernel("demosaic_f3", {
        (0, -2): -3,
        (-1, -1): 4, (1, -1): 4,
        (-2, 0): -3, (0, 0): 12, (2, 0): -3,
        (-1, 1): 4, (1, 1): 4,
        (0, 2): -3,
    })

    # Each divisor equals the sum of the corresponding kernel's taps.
    f0_sum = 8
    f1_sum = 16
    f2_sum = 16
    f3_sum = 16

    output = hl.Func("demosaic_output")
    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")
    rdom0 = hl.RDom([(-2, 5), (-2, 5)])

    input_mirror = hl.BoundaryConditions.mirror_interior(input, [(0, width), (0, height)])

    def convolve(func_name, kernel, divisor):
        """Return a Func holding the normalised, u16-saturated convolution."""
        conv = hl.Func(func_name)
        window = hl.i32(input_mirror[x + rdom0.x, y + rdom0.y])
        conv[x, y] = hl.u16_sat(hl.sum(window * kernel[rdom0.x, rdom0.y]) / divisor)
        return conv

    d0 = convolve("demosaic_0", f0, f0_sum)
    d1 = convolve("demosaic_1", f1, f1_sum)
    d2 = convolve("demosaic_2", f2, f2_sum)
    d3 = convolve("demosaic_3", f3, f3_sum)

    # Row/column parity: "R" means even, "B" means odd.
    R_row = y % 2 == 0
    B_row = y % 2 != 0
    R_col = x % 2 == 0
    B_col = x % 2 != 0

    at_R = c == 0
    at_G = c == 1
    at_B = c == 2

    # Any (channel, site) combination not listed falls through to the raw input.
    output[x, y, c] = hl.select(
        at_R & R_row & B_col, d1[x, y],
        at_R & B_row & R_col, d2[x, y],
        at_R & B_row & B_col, d3[x, y],
        at_G & R_row & R_col, d0[x, y],
        at_G & B_row & B_col, d0[x, y],
        at_B & B_row & R_col, d1[x, y],
        at_B & R_row & B_col, d2[x, y],
        at_B & R_row & R_col, d3[x, y],
        input[x, y])

    for conv in (d0, d1, d2, d3):
        conv.compute_root().parallel(y).vectorize(x, 16)

    output.compute_root().parallel(y).align_bounds(x, 2).unroll(x, 2).align_bounds(y, 2).unroll(y, 2).vectorize(x, 16)

    return output
def gen_g(self):
    ''' define g() function

    Builds the chain of intermediate Funcs leading to ``g[i, j, k, l]`` and
    registers each group through ``self.add_funcs_by_name``.  When both
    ``self.tracing`` and ``self.tracing_g`` are set, an extra ``g_trace_in``
    image input is created, stored in ``self.inputs`` / ``self.clamps`` and
    added to ``g``.
    '''
    # vars
    i, j, k, l = (self.vars[name] for name in "ijkl")

    # clamped inputs
    x, y, z, expnt, fm, rnorm = (
        self.clamps[name] for name in ("x", "y", "z", "expnt", "fm", "rnorm")
    )

    # unclamped input (for sizing)
    fm_in = self.inputs["fm_in"]

    # scalar inputs
    delo2, delta, rdelta = (
        self.inputs[name] for name in ("delo2", "delta", "rdelta")
    )

    # --- pairwise coordinate differences and exponent sums ---
    dx, dy, dz, r2, expnt2, expnt_inv = (
        hl.Func(name)
        for name in ("dx", "dy", "dz", "g_r2", "expnt2", "expnt_inv")
    )
    self.add_funcs_by_name([dx, dy, dz, r2, expnt2, expnt_inv])

    dx[i, j] = x[i] - x[j]
    dy[i, j] = y[i] - y[j]
    dz[i, j] = z[i] - z[j]

    r2[i, j] = dx[i, j] * dx[i, j] + dy[i, j] * dy[i, j] + dz[i, j] * dz[i, j]

    expnt2[i, j] = expnt[i] + expnt[j]
    expnt_inv[i, j] = hl.f64(1.0) / expnt2[i, j]

    # --- exponential factor and denominators ---
    fac2, ex_arg, ex, denom, fac4d = (
        hl.Func(name) for name in ("fac2", "ex_arg", "ex", "denom", "fac4d")
    )
    self.add_funcs_by_name([fac2, ex_arg, ex, denom, fac4d])

    fac2[i, j] = expnt[i] * expnt[j] * expnt_inv[i, j]
    ex_arg[i, j, k, l] = -fac2[i, j] * r2[i, j] - fac2[k, l] * r2[k, l]
    # Arguments below -37 are treated as producing exactly zero.
    ex[i, j, k, l] = hl.select(ex_arg[i, j, k, l] < hl.f64(-37.0),
                               hl.f64(0.0),
                               hl.exp(ex_arg[i, j, k, l]))
    denom[i, j, k, l] = (expnt2[i, j] * expnt2[k, l]
                         * hl.sqrt(expnt2[i, j] + expnt2[k, l]))
    fac4d[i, j, k, l] = (expnt2[i, j] * expnt2[k, l]
                         / (expnt2[i, j] + expnt2[k, l]))

    # --- exponent-weighted pair coordinates and their squared distance ---
    x2, y2, z2, rpq2 = (
        hl.Func(name) for name in ("g_x2", "g_y2", "g_z2", "rpq2")
    )
    self.add_funcs_by_name([x2, y2, z2, rpq2])

    x2[i, j] = (x[i] * expnt[i] + x[j] * expnt[j]) * expnt_inv[i, j]
    y2[i, j] = (y[i] * expnt[i] + y[j] * expnt[j]) * expnt_inv[i, j]
    z2[i, j] = (z[i] * expnt[i] + z[j] * expnt[j]) * expnt_inv[i, j]

    sep_x = x2[i, j] - x2[k, l]
    sep_y = y2[i, j] - y2[k, l]
    sep_z = z2[i, j] - z2[k, l]
    rpq2[i, j, k, l] = (sep_x * sep_x + sep_y * sep_y + sep_z * sep_z)

    # --- table lookup in fm with a nested polynomial in the remainder ---
    f0t, f0n, f0x, f0val = (
        hl.Func(name) for name in ("f0t", "f0n", "f0x", "f0val")
    )
    self.add_funcs_by_name([f0t, f0n, f0x, f0val])

    f0t[i, j, k, l] = fac4d[i, j, k, l] * rpq2[i, j, k, l]
    # Table row index, clamped to the valid range of fm_in's first dimension.
    f0n[i, j, k, l] = hl.clamp(hl.i32((f0t[i, j, k, l] + delo2) * rdelta),
                               fm_in.dim(0).min(),
                               fm_in.dim(0).max())
    f0x[i, j, k, l] = delta * f0n[i, j, k, l] - f0t[i, j, k, l]

    t_val = f0t[i, j, k, l]
    row = f0n[i, j, k, l]
    rem = f0x[i, j, k, l]
    table_value = (
        fm[row, 0] + rem *
        (fm[row, 1] + rem * hl.f64(0.5) *
         (fm[row, 2] + rem * hl.f64(1. / 3.) *
          (fm[row, 3] + rem * hl.f64(0.25) * fm[row, 4]))))
    # For arguments of 28 or more a closed form replaces the table.
    f0val[i, j, k, l] = hl.select(
        t_val >= hl.f64(28.0),
        hl.f64(0.88622692545276) / hl.sqrt(t_val),
        table_value)

    # --- final value ---
    g = hl.Func("g")
    self.add_funcs_by_name([g])

    integral = ((hl.f64(2.00) * hl.f64(pow(pi, 2.50)) / denom[i, j, k, l])
                * ex[i, j, k, l] * f0val[i, j, k, l]
                * rnorm[i] * rnorm[j] * rnorm[k] * rnorm[l])

    g_trace = None
    if self.tracing and self.tracing_g:
        # Extra image input added to g; zero outside the supplied buffer.
        g_trace_in = hl.ImageParam(hl.Float(64), 4, "g_trace_in")
        g_trace = hl.BoundaryConditions.constant_exterior(g_trace_in, 0)
        self.inputs["g_trace_in"] = g_trace_in
        self.clamps["g_trace"] = g_trace
        g_trace.compute_root()
        g[i, j, k, l] = integral + g_trace[i, j, k, l]
    else:
        g[i, j, k, l] = integral
def findStereoCorrespondence(left, right, SADWindowSize, minDisparity, numDisparities,
                             xmin, xmax, ymin, ymax, x_tile_size=32, y_tile_size=32,
                             test=False, uniquenessRatio=0.15, disp12MaxDiff=1):
    """Build a tiled block-matching disparity Func from two image Funcs.

    For each candidate disparity the absolute difference between ``left`` and
    the shifted ``right`` is summed over a ``SADWindowSize`` square window;
    the disparity with the smallest sum is kept.  The sums are computed per
    tile, either directly (``test=True``) or incrementally along each axis.

    Args:
        left, right: Funcs indexed as ``[x, y]``.
        SADWindowSize: Width and height of the summation window.
        minDisparity: First disparity tested.
        numDisparities: Number of disparities tested.
        xmin, ymin: Offsets added to tile coordinates when reading the
            difference image.
        xmax: Pixels with ``x >= xmax`` receive the ``FILTERED`` marker.
        ymax: Not used by this implementation.
        x_tile_size, y_tile_size: Tile dimensions.
        test: When true, use plain windowed sums instead of the incremental
            update definitions.
        uniquenessRatio, disp12MaxDiff: Not used by this implementation.

    Returns:
        The scheduled Func ``disp`` indexed as ``[x, y]``.
    """
    x, y, d = Var("x"), Var("y"), Var("d")

    diff = Func("diff")
    diff[d, x, y] = h.cast(UInt(16), h.abs(left[x, y] - right[x - d, y]))

    # Floor division keeps the half-window an integer; true division would
    # yield a float under Python 3 and corrupt the RDom bounds and indices.
    win2 = SADWindowSize // 2

    diff_T = Func("diff_T")
    xi, xo, yi, yo = Var("xi"), Var("xo"), Var("yi"), Var("yo")
    diff_T[d, xi, yi, xo, yo] = diff[d, xi + xo * x_tile_size + xmin,
                                     yi + yo * y_tile_size + ymin]

    cSAD, vsum = Func("cSAD"), Func("vsum")
    rk = RDom(-win2, SADWindowSize, "rk")
    rxi, ryi = RDom(1, x_tile_size - 1, "rxi"), RDom(1, y_tile_size - 1, "ryi")

    if test:
        vsum[d, xi, yi, xo, yo] = h.sum(diff_T[d, xi, yi + rk, xo, yo])
        cSAD[d, xi, yi, xo, yo] = h.sum(vsum[d, xi + rk, yi, xo, yo])
    else:
        # Seed the first row/column with a full window sum, then slide the
        # window by adding the entering sample and removing the leaving one.
        vsum[d, xi, yi, xo, yo] = h.select(yi != 0, h.cast(UInt(16), 0),
                                           h.sum(diff_T[d, xi, rk, xo, yo]))
        vsum[d, xi, ryi, xo, yo] = (vsum[d, xi, ryi - 1, xo, yo]
                                    + diff_T[d, xi, ryi + win2, xo, yo]
                                    - diff_T[d, xi, ryi - win2 - 1, xo, yo])

        cSAD[d, xi, yi, xo, yo] = h.select(xi != 0, h.cast(UInt(16), 0),
                                           h.sum(vsum[d, rk, yi, xo, yo]))
        cSAD[d, rxi, yi, xo, yo] = (cSAD[d, rxi - 1, yi, xo, yo]
                                    + vsum[d, rxi + win2, yi, xo, yo]
                                    - vsum[d, rxi - win2 - 1, yi, xo, yo])

    rd = RDom(minDisparity, numDisparities)
    disp_left = Func("disp_left")
    # Start with the largest UInt(16) value as the best cost so far.  This is
    # written as (1 << 16) - 1 so it does not depend on the cast wrapping.
    disp_left[xi, yi, xo, yo] = h.Tuple(h.cast(UInt(16), minDisparity),
                                        h.cast(UInt(16), (1 << 16) - 1))
    disp_left[xi, yi, xo, yo] = h.tuple_select(
        cSAD[rd, xi, yi, xo, yo] < disp_left[xi, yi, xo, yo][1],
        h.Tuple(h.cast(UInt(16), rd), cSAD[rd, xi, yi, xo, yo]),
        h.Tuple(disp_left[xi, yi, xo, yo]))

    FILTERED = -16
    disp = Func("disp")
    disp[x, y] = h.select(
        # x > xmax-xmin or y > ymax-ymin,
        x < xmax,
        h.cast(UInt(16),
               disp_left[x % x_tile_size, y % y_tile_size,
                         x / x_tile_size, y / y_tile_size][0]),
        h.cast(UInt(16), FILTERED))

    # Schedule
    vector_width = 8

    disp.compute_root() \
        .tile(x, y, xo, yo, xi, yi, x_tile_size, y_tile_size).reorder(xi, yi, xo, yo) \
        .vectorize(xi, vector_width).parallel(xo).parallel(yo)

    # reorder storage
    disp_left.reorder_storage(xi, yi, xo, yo)
    diff_T.reorder_storage(xi, yi, xo, yo, d)
    vsum.reorder_storage(xi, yi, xo, yo, d)
    cSAD.reorder_storage(xi, yi, xo, yo, d)

    disp_left.compute_at(disp, xo).reorder(xi, yi, xo, yo) \
        .vectorize(xi, vector_width) \
        .update() \
        .reorder(xi, yi, rd, xo, yo).vectorize(xi, vector_width)

    if test:
        cSAD.compute_at(disp_left, rd).reorder(xi, yi, xo, yo, d).vectorize(xi, vector_width)
        vsum.compute_at(disp_left, rd).reorder(xi, yi, xo, yo, d).vectorize(xi, vector_width)
    else:
        cSAD.compute_at(disp_left, rd).reorder(xi, yi, xo, yo, d).vectorize(xi, vector_width) \
            .update() \
            .reorder(yi, rxi, xo, yo, d).vectorize(yi, vector_width)
        vsum.compute_at(disp_left, rd).reorder(xi, yi, xo, yo, d).vectorize(xi, vector_width) \
            .update() \
            .reorder(xi, ryi, xo, yo, d).vectorize(xi, vector_width)

    return disp
def main():
    """Run the tuple tutorial: tuple-valued Funcs, tuple reductions, user types.

    The function defines several Funcs, realizes some of them, checks the
    results with assertions and finally prints a Mandelbrot set as ASCII art.

    Returns:
        0 once every check has passed and the output has been printed.
    """
    # So far Funcs (such as the one below) have evaluated to a single
    # scalar value for each point in their domain.
    single_valued = hl.Func()
    x, y = hl.Var("x"), hl.Var("y")
    single_valued[x, y] = x + y

    # One way to write a hl.Func that returns a collection of values is
    # to add an additional dimension which indexes that
    # collection. This is how we typically deal with color. For
    # example, the hl.Func below represents a collection of three values
    # for every x, y coordinate indexed by c.
    color_image = hl.Func()
    c = hl.Var("c")
    color_image[x, y, c] = hl.select(c == 0, 245,  # Red value
                                     c == 1, 42,   # Green value
                                     132)          # Blue value

    # This method is often convenient because it makes it easy to
    # operate on this hl.Func in a way that treats each item in the
    # collection equally:
    brighter = hl.Func()
    brighter[x, y, c] = color_image[x, y, c] + 10

    # However this method is also inconvenient for three reasons.
    #
    # 1) Funcs are defined over an infinite domain, so users of this
    # hl.Func can for example access color_image(x, y, -17), which is
    # not a meaningful value and is probably indicative of a bug.
    #
    # 2) It requires a hl.select, which can impact performance if not
    # bounded and unrolled:
    # brighter.bound(c, 0, 3).unroll(c)
    #
    # 3) With this method, all values in the collection must have the
    # same type. While the above two issues are merely inconvenient,
    # this one is a hard limitation that makes it impossible to
    # express certain things in this way.

    # It is also possible to represent a collection of values as a
    # collection of Funcs:
    func_array = [hl.Func() for i in range(3)]
    func_array[0][x, y] = x + y
    func_array[1][x, y] = hl.sin(x)
    func_array[2][x, y] = hl.cos(y)

    # This method avoids the three problems above, but introduces a
    # new annoyance. Because these are separate Funcs, it is
    # difficult to schedule them so that they are all computed
    # together inside a single loop over x, y.

    # A third alternative is to define a hl.Func as evaluating to a
    # Tuple instead of an hl.Expr. A Tuple is a fixed-size collection of
    # Exprs which may have different type. The following function
    # evaluates to an integer value (x+y), and a floating point value
    # (hl.sin(x*y)).
    multi_valued = hl.Func("multi_valued")
    multi_valued[x, y] = (x + y, hl.sin(x * y))

    # Realizing a tuple-valued hl.Func returns a collection of
    # Buffers. We call this a Realization. It's equivalent to a
    # std::vector of hl.Buffer/Image objects:
    if True:
        (im1, im2) = multi_valued.realize(80, 60)
        assert type(im1) is hl.Buffer_int32
        assert type(im2) is hl.Buffer_float32
        assert im1(30, 40) == 30 + 40
        assert numpy.isclose(im2(30, 40), math.sin(30 * 40))

    # All Tuple elements are evaluated together over the same domain
    # in the same loop nest, but stored in distinct allocations. The
    # equivalent C++ code to the above is:
    if True:
        multi_valued_0 = numpy.empty((80*60), dtype=numpy.int32)
        # The second tuple element is floating point, so its storage must be
        # a float array; an int32 array would truncate every sine value.
        multi_valued_1 = numpy.empty((80*60), dtype=numpy.float32)

        for yy in range(80):
            for xx in range(60):
                multi_valued_0[xx + 60*yy] = xx + yy
                multi_valued_1[xx + 60*yy] = math.sin(xx*yy)

    # When compiling ahead-of-time, a Tuple-valued hl.Func evaluates
    # into multiple distinct output buffer_t structs. These appear in
    # order at the end of the function signature:
    # int multi_valued(...input buffers and params..., buffer_t *output_1, buffer_t *output_2)

    # In C++ a Tuple is built with the Tuple constructor or a C++11
    # initializer list; in Python a plain tuple of Exprs is enough:
    multi_valued_2 = hl.Func("multi_valued_2")
    multi_valued_2[x, y] = (x + y, hl.sin(x * y))

    # Calls to a multi-valued hl.Func cannot be treated as Exprs. The
    # following is an error:
    # hl.Func consumer
    # consumer[x, y] = multi_valued_2[x, y] + 10

    # Instead you must index the returned object with square brackets
    # to retrieve the individual Exprs:
    integer_part = multi_valued_2[x, y][0]
    floating_part = multi_valued_2[x, y][1]
    assert type(integer_part) is hl.FuncTupleElementRef
    assert type(floating_part) is hl.FuncTupleElementRef

    consumer = hl.Func()
    consumer[x, y] = (integer_part + 10, floating_part + 10.0)

    # Tuple reductions.
    if True:
        # Tuples are particularly useful in reductions, as they allow
        # the reduction to maintain complex state as it walks along
        # its domain. The simplest example is an argmax.

        # First we create an Image to take the argmax over.
        input_func = hl.Func()
        input_func[x] = hl.sin(x)
        input = input_func.realize(100)
        assert type(input) is hl.Buffer_float32

        # Then we define a 2-valued Tuple which tracks the maximum value
        # and its index.
        arg_max = hl.Func()

        # Pure definition.
        # (using [()] for zero-dimensional Funcs is a convention of this python interface)
        arg_max[()] = (0, input(0))

        # Update definition.
        r = hl.RDom(1, 99)
        old_index = arg_max[()][0]
        old_max = arg_max[()][1]
        # Move the index only when the new sample beats the running maximum;
        # on ties the earlier index is kept.
        new_index = hl.select(old_max < input[r], r, old_index)
        new_max = hl.max(input[r], old_max)
        arg_max[()] = (new_index, new_max)

        # The equivalent C++ is:
        arg_max_0 = 0
        arg_max_1 = float(input(0))
        for ri in range(1, 100):
            old_index = arg_max_0
            old_max = arg_max_1
            new_index = ri if (old_max < input(ri)) else old_index
            new_max = max(input(ri), old_max)
            # In a tuple update definition, all loads and computation
            # are done before any stores, so that all Tuple elements
            # are updated atomically with respect to recursive calls
            # to the same hl.Func.
            arg_max_0 = new_index
            arg_max_1 = new_max

        # Let's verify that the Halide and C++ found the same maximum
        # value and index.
        if True:
            (r0, r1) = arg_max.realize()

            assert type(r0) is hl.Buffer_int32
            assert type(r1) is hl.Buffer_float32
            assert arg_max_0 == r0(0)
            assert numpy.isclose(arg_max_1, r1(0))

        # Halide provides hl.argmax and hl.argmin as built-in reductions
        # similar to sum, product, maximum, and minimum. They return
        # a Tuple consisting of the point in the reduction domain
        # corresponding to that value, and the value itself. In the
        # case of ties they return the first value found. We'll use
        # one of these in the following section.

    # Tuples for user-defined types.
    if True:
        # Tuples can also be a convenient way to represent compound
        # objects such as complex numbers. Defining an object that
        # can be converted to and from a Tuple is one way to extend
        # Halide's type system with user-defined types.
        class Complex:
            """Complex number whose parts are Halide expressions."""

            def __init__(self, r, i=None):
                if type(r) is float and type(i) is float:
                    # Two Python floats: wrap them as Exprs.
                    self.real = hl.Expr(r)
                    self.imag = hl.Expr(i)
                elif i is not None:
                    # Two separate components given as-is.
                    self.real = r
                    self.imag = i
                else:
                    # A single two-element object: (real, imag).
                    self.real = r[0]
                    self.imag = r[1]

            def as_tuple(self):
                "Convert to a Tuple"
                return (self.real, self.imag)

            def __add__(self, other):
                "Complex addition"
                return Complex(self.real + other.real, self.imag + other.imag)

            def __mul__(self, other):
                "Complex multiplication"
                return Complex(self.real * other.real - self.imag * other.imag,
                               self.real * other.imag + self.imag * other.real)

            def __getitem__(self, idx):
                return (self.real, self.imag)[idx]

            def __len__(self):
                return 2

            def magnitude(self):
                "Squared complex magnitude (no square root is taken)"
                return (self.real * self.real) + (self.imag * self.imag)

            # Other complex operators would go here. The above are
            # sufficient for this example.

        # Let's use the Complex struct to compute a Mandelbrot set.
        mandelbrot = hl.Func()

        # The initial complex value corresponding to an x, y coordinate
        # in our hl.Func.
        initial = Complex(x/15.0 - 2.5, y/6.0 - 2.0)

        # Pure definition.
        t = hl.Var("t")
        mandelbrot[x, y, t] = Complex(0.0, 0.0)

        # We'll use an update definition to take 12 steps.
        r = hl.RDom(1, 12)
        current = Complex(mandelbrot[x, y, r-1])

        # The following line uses the complex multiplication and
        # addition we defined above.
        mandelbrot[x, y, r] = (Complex(current*current) + initial)

        # We'll use another tuple reduction to compute the iteration
        # number where the value first escapes a circle of radius 4
        # (the squared magnitude is compared against 16).
        # This can be expressed as an hl.argmin of a boolean - we want
        # the index of the first time the given boolean expression is
        # false (we consider false to be less than true). The argmax
        # would return the index of the first time the expression is
        # true.
        escape_condition = Complex(mandelbrot[x, y, r]).magnitude() < 16.0
        first_escape = hl.argmin(escape_condition)
        assert type(first_escape) is tuple
        # We only want the index, not the value, but hl.argmin returns
        # both, so we'll index the hl.argmin Tuple expression using
        # square brackets to get the hl.Expr representing the index.
        escape = hl.Func()
        escape[x, y] = first_escape[0]

        # Realize the pipeline and print the result as ascii art.
        result = escape.realize(61, 25)
        assert type(result) is hl.Buffer_int32
        code = " .:-~*={&%#@"
        for yy in range(result.height()):
            for xx in range(result.width()):
                index = result(xx, yy)
                if index < len(code):
                    print("%c" % code[index], end="")
                else:
                    pass  # is lesson 13 cpp version buggy ?
            print("")

    print("Success!")

    return 0
def get_bilateral_grid(input, r_sigma, s_sigma):
    """Build and schedule a bilateral-grid filter pipeline over ``input``.

    The pipeline accumulates a 4-D histogram (grid x, grid y, intensity
    bin, channel) from ``s_sigma`` x ``s_sigma`` neighbourhoods of the
    input, blurs it with a five-tap (1, 4, 6, 4, 1) filter along z, x and
    y, samples the blurred grid trilinearly at every pixel, and divides
    channel 0 (accumulated values) by channel 1 (accumulated counts).

    Args:
        input: Halide image/Func indexed as ``input[x, y]``. Samples are
            clamped to [0, 1] before use.
        r_sigma: Width of one intensity bin (values are divided by it to
            obtain the z coordinate).
        s_sigma: Spatial cell size in pixels. It is used with ``//`` and as
            a tile size, so presumably a plain Python int; confirm with
            the callers.

    Returns:
        The scheduled 2-D ``hl.Func`` named ``'bilateral_grid'``.
    """
    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')
    # Inner tile variables for the GPU schedule. They must stay hl.Var
    # objects, so the index expressions below get their own names instead
    # of rebinding xi/yi/zi.
    xi = hl.Var("xi")
    yi = hl.Var("yi")
    zi = hl.Var("zi")

    # Add a boundary condition
    clamped = hl.BoundaryConditions.repeat_edge(input)

    # Construct the bilateral grid
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    val = clamped[x * s_sigma + r.x - s_sigma // 2,
                  y * s_sigma + r.y - s_sigma // 2]
    val = hl.clamp(val, 0.0, 1.0)
    # Intensity bin of the sample, rounded to the nearest bin.
    bin_z = hl.i32(val / r_sigma + 0.5)
    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0
    # Channel 0 accumulates the sample values, channel 1 the sample count.
    histogram[x, y, bin_z, c] += hl.select(c == 0, val, 1.0)

    # Blur the histogram using a five-tap filter
    blurx, blury, blurz = hl.Func('blurx'), hl.Func('blury'), hl.Func('blurz')
    blurz[x, y, z, c] = (histogram[x, y, z-2, c] + histogram[x, y, z-1, c]*4 +
                         histogram[x, y, z, c]*6 + histogram[x, y, z+1, c]*4 +
                         histogram[x, y, z+2, c])
    blurx[x, y, z, c] = (blurz[x-2, y, z, c] + blurz[x-1, y, z, c]*4 +
                         blurz[x, y, z, c]*6 + blurz[x+1, y, z, c]*4 +
                         blurz[x+2, y, z, c])
    blury[x, y, z, c] = (blurx[x, y-2, z, c] + blurx[x, y-1, z, c]*4 +
                         blurx[x, y, z, c]*6 + blurx[x, y+1, z, c]*4 +
                         blurx[x, y+2, z, c])

    # Take trilinear samples to compute the output
    val = hl.clamp(clamped[x, y], 0.0, 1.0)
    zv = val / r_sigma
    cell_z = hl.i32(zv)
    zf = zv - cell_z
    xf = hl.f32(x % s_sigma) / s_sigma
    yf = hl.f32(y % s_sigma) / s_sigma
    cell_x = x / s_sigma
    cell_y = y / s_sigma
    interpolated = hl.Func('interpolated')
    interpolated[x, y, c] = hl.lerp(
        hl.lerp(
            hl.lerp(blury[cell_x, cell_y, cell_z, c],
                    blury[cell_x+1, cell_y, cell_z, c], xf),
            hl.lerp(blury[cell_x, cell_y+1, cell_z, c],
                    blury[cell_x+1, cell_y+1, cell_z, c], xf),
            yf),
        hl.lerp(
            hl.lerp(blury[cell_x, cell_y, cell_z+1, c],
                    blury[cell_x+1, cell_y, cell_z+1, c], xf),
            hl.lerp(blury[cell_x, cell_y+1, cell_z+1, c],
                    blury[cell_x+1, cell_y+1, cell_z+1, c], xf),
            yf),
        zf)

    # Normalize
    bilateral_grid = hl.Func('bilateral_grid')
    bilateral_grid[x, y] = interpolated[x, y, 0] / interpolated[x, y, 1]

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU schedule
        # Currently running this directly from the Python code is very slow.
        # Probably because of the dispatch time because generated code
        # is same speed as C++ generated code.
        print("Compiling for GPU.")
        # The pure stage uses the same explicit inner-variable form of
        # gpu_tile as the update stage and the other Funcs below.
        histogram.compute_root().reorder(c, z, x, y).gpu_tile(x, y, xi, yi, 8, 8)
        histogram.update().reorder(c, r.x, r.y, x, y).gpu_tile(x, y, xi, yi, 8, 8).unroll(c)
        blurx.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blury.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blurz.compute_root().gpu_tile(x, y, z, xi, yi, zi, 8, 8, 4)
        bilateral_grid.compute_root().gpu_tile(x, y, xi, yi, s_sigma, s_sigma)
    else:
        # CPU schedule
        print("Compiling for CPU.")
        histogram.compute_root().parallel(z)
        histogram.update().reorder(c, r.x, r.y, x, y).unroll(c)
        blurz.compute_root().reorder(c, z, x, y).parallel(y).vectorize(x, 4).unroll(c)
        blurx.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        blury.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        bilateral_grid.compute_root().parallel(y).vectorize(x, 4)

    return bilateral_grid
def bilateral_filter(input, width, height):
    """Build a Func that bilateral-filters channels 1 and 2 of ``input``.

    Every output channel starts as ``input`` cast to Float(32); channels 1
    and 2 are then overwritten with a weighted average over a 7x7
    neighbourhood of the mirrored input. Each neighbour's weight is the
    fixed kernel tap times ``exp(-dist * dist / 100)``, where ``dist`` is
    the Int(32) difference to the centre sample, and is zero when the
    neighbour's absolute value exceeds 25000.

    Args:
        input: Halide image/Func indexed as ``input[x, y, c]``.
        width: Extent of the mirrored region along x.
        height: Extent of the mirrored region along y.

    Returns:
        The scheduled ``hl.Func`` named ``"bilateral_filter_output"``.
    """
    print(' bilateral_filter')

    # Kernel taps: the outer index is the y offset (-3..3), the inner index
    # is the x offset (-3..3).
    kernel_rows = (
        (0.000690, 0.002646, 0.005923, 0.007748, 0.005923, 0.002646, 0.000690),
        (0.002646, 0.010149, 0.022718, 0.029715, 0.022718, 0.010149, 0.002646),
        (0.005923, 0.022718, 0.050855, 0.066517, 0.050855, 0.022718, 0.005923),
        (0.007748, 0.029715, 0.066517, 0.087001, 0.066517, 0.029715, 0.007748),
        (0.005923, 0.022718, 0.050855, 0.066517, 0.050855, 0.022718, 0.005923),
        (0.002646, 0.010149, 0.022718, 0.029715, 0.022718, 0.010149, 0.002646),
        (0.000690, 0.002646, 0.005923, 0.007748, 0.005923, 0.002646, 0.000690),
    )

    gauss = hl.Buffer(hl.Float(32), [7, 7], "gauss_kernel")
    # Shift the buffer origin so taps are addressed by offsets -3..3.
    gauss.translate([-3, -3])

    weights = hl.Func("bilateral_weights")
    total_weights = hl.Func("bilateral_total_weights")
    bilateral = hl.Func("bilateral")
    output = hl.Func("bilateral_filter_output")

    x, y, dx, dy, c = (hl.Var(name) for name in ("x", "y", "dx", "dy", "c"))
    window = hl.RDom([(-3, 7), (-3, 7)])

    gauss.fill(0)
    for off_y, row in enumerate(kernel_rows, start=-3):
        for off_x, tap in enumerate(row, start=-3):
            gauss[off_x, off_y] = tap

    input_mirror = hl.BoundaryConditions.mirror_interior(
        input, [(0, width), (0, height)])

    # Signed difference between the centre sample and its neighbour.
    centre = hl.cast(hl.Int(32), input_mirror[x, y, c])
    neighbour = hl.cast(hl.Int(32), input_mirror[x + dx, y + dy, c])
    dist = hl.cast(hl.Float(32), centre - neighbour)

    sigma_sq = 100
    cutoff = 25000

    # Neighbours above the cutoff contribute nothing.
    score = hl.select(
        hl.abs(input_mirror[x + dx, y + dy, c]) > cutoff,
        0,
        hl.exp(-dist * dist / sigma_sq))

    weights[dx, dy, x, y, c] = gauss[dx, dy] * score

    total_weights[x, y, c] = hl.sum(weights[window.x, window.y, x, y, c])

    weighted_sum = hl.sum(
        input_mirror[x + window.x, y + window.y, c]
        * weights[window.x, window.y, x, y, c])
    bilateral[x, y, c] = weighted_sum / total_weights[x, y, c]

    output[x, y, c] = hl.cast(hl.Float(32), input[x, y, c])
    for channel in (1, 2):
        output[x, y, channel] = bilateral[x, y, channel]

    weights.compute_at(output, y).vectorize(x, 16)

    output.compute_root().parallel(y).vectorize(x, 16)
    # One update stage per overwritten channel.
    for stage_index in range(2):
        output.update(stage_index).parallel(y).vectorize(x, 16)

    return output
def findStereoCorrespondence(left, right, SADWindowSize, minDisparity, numDisparities,
                             xmin, xmax, ymin, ymax, x_tile_size=32, y_tile_size=32,
                             test=False, uniquenessRatio=0.15, disp12MaxDiff=1):
    """Build a tiled SAD block-matching disparity pipeline.

    For every disparity ``d`` the absolute difference ``|left[x, y] -
    right[x - d, y]|`` is summed over a ``SADWindowSize`` window (first
    vertically into ``vsum``, then horizontally into ``cSAD``), and the
    disparity with the smallest sum is kept per pixel.

    Args:
        left: Func indexed as ``left[x, y]``.
        right: Func indexed as ``right[x, y]``.
        SADWindowSize: Window extent used for the sums.
        minDisparity: First disparity searched.
        numDisparities: Number of disparities searched.
        xmin, ymin: Offsets added to the tile-local coordinates.
        xmax: Pixels with ``x >= xmax`` get the ``FILTERED`` value.
        ymax: Not used by the current implementation.
        x_tile_size, y_tile_size: Tile extents.
        test: When true, use the direct window sums instead of the
            running-sum update definitions.
        uniquenessRatio: Not used by the current implementation.
        disp12MaxDiff: Not used by the current implementation.

    Returns:
        The scheduled Func ``"disp"`` holding UInt(16) disparities.
    """
    x, y, d = Var("x"), Var("y"), Var("d")

    diff = Func("diff")
    diff[d, x, y] = h.cast(UInt(16), h.abs(left[x, y] - right[x-d, y]))

    # Half window size. With a Python int, `/` would give a float under
    # Python 3 and break the RDom bounds and index offsets below, so floor
    # division is used. Any other operand type keeps the original operator.
    if isinstance(SADWindowSize, int):
        win2 = SADWindowSize // 2
    else:
        win2 = SADWindowSize / 2

    # Tile-local view of diff: (xi, yi) inside tile (xo, yo).
    diff_T = Func("diff_T")
    xi, xo, yi, yo = Var("xi"), Var("xo"), Var("yi"), Var("yo")
    diff_T[d, xi, yi, xo, yo] = diff[d, xi + xo * x_tile_size + xmin, yi + yo * y_tile_size + ymin]

    cSAD, vsum = Func("cSAD"), Func("vsum")
    rk = RDom(-win2, SADWindowSize, "rk")
    rxi, ryi = RDom(1, x_tile_size - 1, "rxi"), RDom(1, y_tile_size - 1, "ryi")
    if test:
        # Reference path: direct window sums at every point.
        vsum[d, xi, yi, xo, yo] = h.sum(diff_T[d, xi, yi+rk, xo, yo])
        cSAD[d, xi, yi, xo, yo] = h.sum(vsum[d, xi+rk, yi, xo, yo])
    else:
        # Running sums: full window at row/column 0, then add the entering
        # sample and subtract the leaving one.
        vsum[d, xi, yi, xo, yo] = h.select(yi != 0, h.cast(UInt(16), 0), h.sum(diff_T[d, xi, rk, xo, yo]))
        vsum[d, xi, ryi, xo, yo] = (vsum[d, xi, ryi-1, xo, yo]
                                    + diff_T[d, xi, ryi+win2, xo, yo]
                                    - diff_T[d, xi, ryi-win2-1, xo, yo])

        cSAD[d, xi, yi, xo, yo] = h.select(xi != 0, h.cast(UInt(16), 0), h.sum(vsum[d, rk, yi, xo, yo]))
        cSAD[d, rxi, yi, xo, yo] = (cSAD[d, rxi-1, yi, xo, yo]
                                    + vsum[d, rxi+win2, yi, xo, yo]
                                    - vsum[d, rxi-win2-1, yi, xo, yo])

    rd = RDom(minDisparity, numDisparities)
    disp_left = Func("disp_left")
    # Tuple of (best disparity, best cost). The cost starts at the largest
    # UInt(16) value; the previous `(2<<16)-1` only reached it by wrapping.
    disp_left[xi, yi, xo, yo] = h.Tuple(h.cast(UInt(16), minDisparity), h.cast(UInt(16), (1 << 16) - 1))
    disp_left[xi, yi, xo, yo] = h.tuple_select(
        cSAD[rd, xi, yi, xo, yo] < disp_left[xi, yi, xo, yo][1],
        h.Tuple(h.cast(UInt(16), rd), cSAD[rd, xi, yi, xo, yo]),
        h.Tuple(disp_left[xi, yi, xo, yo]))

    FILTERED = -16
    disp = Func("disp")
    disp[x, y] = h.select(
        # x > xmax-xmin or y > ymax-ymin,
        x < xmax,
        h.cast(UInt(16), disp_left[x % x_tile_size, y % y_tile_size, x / x_tile_size, y / y_tile_size][0]),
        h.cast(UInt(16), FILTERED))

    # Schedule
    vector_width = 8

    disp.compute_root() \
        .tile(x, y, xo, yo, xi, yi, x_tile_size, y_tile_size).reorder(xi, yi, xo, yo) \
        .vectorize(xi, vector_width).parallel(xo).parallel(yo)

    # reorder storage
    disp_left.reorder_storage(xi, yi, xo, yo)
    diff_T.reorder_storage(xi, yi, xo, yo, d)
    vsum.reorder_storage(xi, yi, xo, yo, d)
    cSAD.reorder_storage(xi, yi, xo, yo, d)

    disp_left.compute_at(disp, xo).reorder(xi, yi, xo, yo) \
             .vectorize(xi, vector_width) \
             .update() \
             .reorder(xi, yi, rd, xo, yo).vectorize(xi, vector_width)

    if test:
        cSAD.compute_at(disp_left, rd).reorder(xi, yi, xo, yo, d).vectorize(xi, vector_width)
        vsum.compute_at(disp_left, rd).reorder(xi, yi, xo, yo, d).vectorize(xi, vector_width)
    else:
        cSAD.compute_at(disp_left, rd).reorder(xi, yi, xo, yo, d).vectorize(xi, vector_width) \
            .update() \
            .reorder(yi, rxi, xo, yo, d).vectorize(yi, vector_width)
        vsum.compute_at(disp_left, rd).reorder(xi, yi, xo, yo, d).vectorize(xi, vector_width) \
            .update() \
            .reorder(xi, ryi, xo, yo, d).vectorize(xi, vector_width)

    return disp
def merge_laplacian(x, y, c, merged_energy, next_energy, prev_lap, next_lap):
    """Pick, per pixel, the Laplacian sample from ``next_lap`` or ``prev_lap``.

    Where ``merged_energy[x, y]`` equals ``next_energy[x, y]`` the result
    takes ``next_lap[x, y, c]``; everywhere else it takes
    ``prev_lap[x, y, c]``.

    Args:
        x, y, c: Index variables used to define the result.
        merged_energy: Indexed as ``merged_energy[x, y]``.
        next_energy: Indexed as ``next_energy[x, y]``.
        prev_lap: Indexed as ``prev_lap[x, y, c]``.
        next_lap: Indexed as ``next_lap[x, y, c]``.

    Returns:
        The object produced by ``mkfunc('merged_lap', ...)`` with its
        ``[x, y, c]`` definition set (presumably an hl.Func; ``mkfunc`` is
        defined elsewhere in this file).
    """
    result = mkfunc('merged_lap', merged_energy, next_energy, next_lap, prev_lap)
    take_next = merged_energy[x, y] == next_energy[x, y]
    result[x, y, c] = hl.select(take_next, next_lap[x, y, c], prev_lap[x, y, c])
    return result