def histogram(x, y, c, img, w, h, hist_index):
    print("GET HIST ON: ", w, h)
    histogram = hl.Func("histogram")

    # Histogram buckets start as zero.
    histogram[hist_index] = 0

    # Define a multi-dimensional reduction domain over the input image:
    r = hl.RDom([(0, w), (0, h)])

    # For every point in the reduction domain, increment the
    # histogram bucket corresponding to the intensity of the
    # input image at that point.
    histogram[hl.Expr(img[r.x, r.y])] += 1

    histogram.set_estimate(hist_index, 0, 255)

    # Get the sum of all histogram cells. The reduction must cover all
    # 256 buckets, so its extent is 256, not 255.
    r = hl.RDom([(0, 256)])
    hist_sum = hl.Func('hist_sum')
    hist_sum[()] = 0.0  # Accumulate the sum as a float
    hist_sum[()] += histogram[r.x]

    # Express each histogram bucket as a fraction of the total count
    pct_hist = hl.Func('pct_hist')
    pct_hist[hist_index] = histogram[hist_index] / hist_sum[()]

    return pct_hist
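# A minimal usage sketch for the function above (hypothetical data and
# variable names): realize the per-bucket fractions for a small uint8 image.
# The numpy-axis-to-Halide-dimension mapping follows the tutorial code later
# in this collection (numpy axis 0 maps to Halide x).
import numpy as np
import halide as hl

data = np.random.randint(0, 256, size=(64, 48), dtype=np.uint8)
img = hl.Buffer(data)
x, y, c = hl.Var('x'), hl.Var('y'), hl.Var('c')
hist_index = hl.Var('hist_index')
pct = histogram(x, y, c, img, img.width(), img.height(), hist_index)
fractions = pct.realize(256)  # the 256 bucket fractions sum to ~1.0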
def merge_temporal(images, alignment):
    weight = hl.Func("merge_temporal_weights")
    total_weight = hl.Func("merge_temporal_total_weights")
    output = hl.Func("merge_temporal_output")

    ix, iy, tx, ty, n = hl.Var('ix'), hl.Var('iy'), hl.Var('tx'), hl.Var('ty'), hl.Var('n')
    rdom0 = hl.RDom([(0, 16), (0, 16)])
    rdom1 = hl.RDom([(1, images.dim(2).extent() - 1)])

    imgs_mirror = hl.BoundaryConditions.mirror_interior(
        images, [(0, images.width()), (0, images.height())])

    layer = box_down2(imgs_mirror, "merge_layer")

    offset = Point(alignment[tx, ty, n]).clamp(
        Point(MINIMUM_OFFSET, MINIMUM_OFFSET),
        Point(MAXIMUM_OFFSET, MAXIMUM_OFFSET))

    al_x = idx_layer(tx, rdom0.x) + offset.x / 2
    al_y = idx_layer(ty, rdom0.y) + offset.y / 2

    ref_val = layer[idx_layer(tx, rdom0.x), idx_layer(ty, rdom0.y), 0]
    alt_val = layer[al_x, al_y, n]

    factor = 8.0
    min_distance = 10
    max_distance = 300  # max L1 distance, otherwise the value is not used

    distance = hl.sum(hl.abs(hl.cast(hl.Int(32), ref_val)
                             - hl.cast(hl.Int(32), alt_val))) / 256

    normal_distance = hl.max(1, hl.cast(hl.Int(32), distance) / factor
                             - min_distance / factor)

    # Weight for the alternate frame
    weight[tx, ty, n] = hl.select(normal_distance > (max_distance - min_distance),
                                  0.0, 1.0 / normal_distance)

    total_weight[tx, ty] = hl.sum(weight[tx, ty, rdom1]) + 1

    offset = Point(alignment[tx, ty, rdom1])

    al_x = idx_im(tx, ix) + offset.x
    al_y = idx_im(ty, iy) + offset.y

    ref_val = imgs_mirror[idx_im(tx, ix), idx_im(ty, iy), 0]
    alt_val = imgs_mirror[al_x, al_y, rdom1]

    # Sum all values according to their weight, and divide by the total
    # weight to obtain the average
    output[ix, iy, tx, ty] = (hl.sum(weight[tx, ty, rdom1] * alt_val / total_weight[tx, ty])
                              + ref_val / total_weight[tx, ty])

    weight.compute_root().parallel(ty).vectorize(tx, 16)
    total_weight.compute_root().parallel(ty).vectorize(tx, 16)
    output.compute_root().parallel(ty).vectorize(ix, 32)

    return output
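# The tile-indexing helpers `idx_im` and `idx_layer` are assumed from the
# surrounding pipeline. A plausible sketch, assuming 32-pixel tiles that
# overlap by half (16 pixels at the 2x-downsampled layer); the constants
# here are hypothetical, not confirmed by this code:
def idx_im(tile, i):
    # Hypothetical: full-resolution coordinate of pixel i within the tile grid
    return tile * 16 + i


def idx_layer(tile, i):
    # Hypothetical: the same coordinate in the 2x-downsampled layer
    return tile * 8 + i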
def gauss_15x15(input, name):
    print(' gauss_15x15')

    k = hl.Buffer(hl.Float(32), [15], "gauss_15x15")
    k.translate([-7])

    rdom = hl.RDom([(-7, 15)])

    k.fill(0)
    k[-7] = 0.004961
    k[-6] = 0.012246
    k[-5] = 0.026304
    k[-4] = 0.049165
    k[-3] = 0.079968
    k[-2] = 0.113193
    k[-1] = 0.139431
    k[0] = 0.149464
    k[1] = 0.139431
    k[2] = 0.113193
    k[3] = 0.079968
    k[4] = 0.049165
    k[5] = 0.026304
    k[6] = 0.012246
    k[7] = 0.004961

    return gauss(input, k, rdom, name)
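# `gauss` is defined elsewhere in this pipeline. A minimal sketch of what it
# plausibly does, assuming a separable convolution of an unbounded 2-D Func
# `input` with the 1-D kernel buffer `k` over reduction domain `rdom`
# (hypothetical; the real helper may also handle 3-D inputs):
import halide as hl


def gauss(input, k, rdom, name):
    blur_x = hl.Func(name + "_x")
    output = hl.Func(name)
    x, y = hl.Var("x"), hl.Var("y")
    # Horizontal pass, then vertical pass, each a weighted sum over rdom
    blur_x[x, y] = hl.sum(input[x + rdom.x, y] * k[rdom.x])
    output[x, y] = hl.sum(blur_x[x, y + rdom.x] * k[rdom.x])
    blur_x.compute_at(output, x)
    output.compute_root().parallel(y).vectorize(x, 16)
    return output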
def test_basics3():
    input = hl.ImageParam(hl.Float(32), 3, 'input')
    # Value needed if not generating an executable
    r_sigma = hl.Param(hl.Float(32), 'r_sigma', 0.1)
    # This is passed during code generation in the C++ version
    s_sigma = 8

    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')

    # Add a boundary condition
    clamped = hl.Func('clamped')
    clamped[x, y] = input[hl.clamp(x, 0, input.width() - 1),
                          hl.clamp(y, 0, input.height() - 1), 0]

    # Construct the bilateral grid
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    val = clamped[x * s_sigma + r.x - s_sigma // 2,
                  y * s_sigma + r.y - s_sigma // 2]
    val = hl.clamp(val, 0.0, 1.0)
    zi = hl.i32((val / r_sigma) + 0.5)
    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0

    ss = hl.select(c == 0, val, 1.0)
    left = histogram[x, y, zi, c]
    left += 5
    left += ss
def test_rdom():
    x = hl.Var("x")
    y = hl.Var("y")

    diagonal = hl.Func("diagonal")
    diagonal[x, y] = 1

    domain_width = 10
    domain_height = 10

    r = hl.RDom([(0, domain_width), (0, domain_height)])
    r.where(r.x <= r.y)

    diagonal[r.x, r.y] = 2
    output = diagonal.realize(domain_width, domain_height)

    for iy in range(domain_height):
        for ix in range(domain_width):
            if ix <= iy:
                assert output[ix, iy] == 2
            else:
                assert output[ix, iy] == 1

    print("Success!")
    return 0
def test_basics2():
    input = hl.ImageParam(hl.Float(32), 3, 'input')
    r_sigma = hl.Param(hl.Float(32), 'r_sigma', 0.1)
    s_sigma = 8

    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')

    # Add a boundary condition
    clamped = hl.Func('clamped')
    clamped[x, y] = input[hl.clamp(x, 0, input.width() - 1),
                          hl.clamp(y, 0, input.height() - 1), 0]

    # Construct the bilateral grid
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    val0 = clamped[x * s_sigma, y * s_sigma]
    val00 = clamped[x * s_sigma * hl.i32(1), y * s_sigma * hl.i32(1)]
    val22 = clamped[x * s_sigma - hl.i32(s_sigma // 2),
                    y * s_sigma - hl.i32(s_sigma // 2)]
    val2 = clamped[x * s_sigma - s_sigma // 2, y * s_sigma - s_sigma // 2]
    val3 = clamped[x * s_sigma + r.x - s_sigma // 2,
                   y * s_sigma + r.y - s_sigma // 2]

    try:
        val1 = clamped[x * s_sigma - s_sigma / 2, y * s_sigma - s_sigma / 2]
    except RuntimeError as e:
        assert 'Implicit cast from float32 to int' in str(e)
    else:
        assert False, 'Did not see expected exception!'
def test_basics2():
    input = hl.ImageParam(hl.Float(32), 3, 'input')
    # Value needed if not generating an executable
    r_sigma = hl.Param(hl.Float(32), 'r_sigma', 0.1)
    # This is passed during code generation in the C++ version
    s_sigma = 8

    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')

    # Add a boundary condition
    clamped = hl.Func('clamped')
    clamped[x, y] = input[hl.clamp(x, 0, input.width() - 1),
                          hl.clamp(y, 0, input.height() - 1), 0]

    if True:
        print("s_sigma", s_sigma)
        print("s_sigma/2", s_sigma / 2)
        print("s_sigma//2", s_sigma // 2)
        print()
        print("x * s_sigma", x * s_sigma)
        print("x * 8", x * 8)
        print("x * 8 + 4", x * 8 + 4)
        print("x * 8 * 4", x * 8 * 4)
        print()
        print("x", x)
        print("(x * s_sigma).type()", (x * s_sigma).type())
        print("(x * 8).type()", (x * 8).type())
        print("(x * 8 + 4).type()", (x * 8 + 4).type())
        print("(x * 8 * 4).type()", (x * 8 * 4).type())
        print("(x * 8 / 4).type()", (x * 8 / 4).type())
        print("((x * 8) * 4).type()", ((x * 8) * 4).type())
        print("(x * (8 * 4)).type()", (x * (8 * 4)).type())

        assert (x * 8).type() == hl.Int(32)
        assert (x * 8 * 4).type() == hl.Int(32)  # yes this did fail at some point
        assert ((x * 8) / 4).type() == hl.Int(32)
        assert (x * (8 / 4)).type() == hl.Float(32)  # under python3 division rules
        assert (x * (8 // 4)).type() == hl.Int(32)
        #assert (x * 8 // 4).type() == hl.Int(32) # not yet implemented

    # Construct the bilateral grid
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    val0 = clamped[x * s_sigma, y * s_sigma]
    val00 = clamped[x * s_sigma * hl.cast(hl.Int(32), 1),
                    y * s_sigma * hl.cast(hl.Int(32), 1)]
    #val1 = clamped[x * s_sigma - s_sigma/2, y * s_sigma - s_sigma/2] # should fail
    val22 = clamped[x * s_sigma - hl.cast(hl.Int(32), s_sigma // 2),
                    y * s_sigma - hl.cast(hl.Int(32), s_sigma // 2)]
    val2 = clamped[x * s_sigma - s_sigma // 2, y * s_sigma - s_sigma // 2]
    val3 = clamped[x * s_sigma + r.x - s_sigma // 2,
                   y * s_sigma + r.y - s_sigma // 2]
    return
def test_basics4():
    # Test for f[g[r]] = ...
    # See https://github.com/halide/Halide/issues/4285
    x = hl.Var('x')
    f = hl.Func('f')
    g = hl.Func('g')
    g[x] = 1
    f[x] = 0.0
    r = hl.RDom([(0, 100)])
    f[g[r]] = 2.3  # This triggers a warning of double-to-float conversion
    f.compute_root()
    f.compile_jit()
def test_basics4():
    # Test for f[g[r]] = ...
    # See https://github.com/halide/Halide/issues/4285
    x = hl.Var('x')
    f = hl.Func('f')
    g = hl.Func('g')
    g[x] = 1
    f[x] = 0.0
    r = hl.RDom([(0, 100)])
    f[g[r]] = 2.5
    f.compute_root()
    f.compile_jit()
def box_down2(input, name):
    output = hl.Func(name)
    x, y, n = hl.Var("x"), hl.Var("y"), hl.Var('n')
    rdom = hl.RDom([(0, 2), (0, 2)])

    # Average each non-overlapping 2x2 block, accumulating in 32 bits
    output[x, y, n] = hl.cast(
        hl.UInt(16),
        hl.sum(hl.cast(hl.UInt(32), input[2 * x + rdom.x, 2 * y + rdom.y, n])) / 4)

    output.compute_root().parallel(y).vectorize(x, 16)

    return output
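# A minimal usage sketch (hypothetical sizes): halve a 4-frame uint16 burst.
# As in the tutorial code below, numpy axis 0 maps to Halide x.
import numpy as np
import halide as hl

burst = hl.Buffer(np.zeros((640, 480, 4), dtype=np.uint16))  # x, y, frame
half = box_down2(burst, "layer0")
result = half.realize(320, 240, 4)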
def align_layer(layer, prev_alignment, prev_min, prev_max):
    scores = hl.Func(layer.name() + "_scores")
    alignment = hl.Func(layer.name() + "_alignment")
    xi, yi, tx, ty, n = hl.Var("xi"), hl.Var("yi"), hl.Var('tx'), hl.Var('ty'), hl.Var('n')
    rdom0 = hl.RDom([(0, 16), (0, 16)])
    rdom1 = hl.RDom([(-4, 8), (-4, 8)])

    # Alignment of the previous (coarser) layer scaled to this (finer) layer
    prev_offset = DOWNSAMPLE_RATE * Point(
        prev_alignment[prev_tile(tx), prev_tile(ty), n]).clamp(prev_min, prev_max)

    x0 = idx_layer(tx, rdom0.x)
    y0 = idx_layer(ty, rdom0.y)

    # (x, y) coordinates in the search region relative to the offset
    # obtained from the alignment of the previous layer
    x = x0 + prev_offset.x + xi
    y = y0 + prev_offset.y + yi

    ref_val = layer[x0, y0, 0]  # Value of the reference frame (the first frame)
    alt_val = layer[x, y, n]    # Alternate frame value

    # L1 distance between reference frame and alternate frame
    d = hl.abs(hl.cast(hl.Int(32), ref_val) - hl.cast(hl.Int(32), alt_val))
    scores[xi, yi, tx, ty, n] = hl.sum(d)

    # Alignment for each tile, where the L1 distance is minimal
    alignment[tx, ty, n] = Point(hl.argmin(scores[rdom1.x, rdom1.y, tx, ty, n])) + prev_offset

    scores.compute_at(alignment, tx).vectorize(xi, 8)
    alignment.compute_root().parallel(ty).vectorize(tx, 16)

    return alignment
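# Per tile, the argmin above amounts to the brute-force search sketched here
# in NumPy (hypothetical single-tile data; the 8x8 window covers xi, yi in
# -4..3, matching rdom1):
import numpy as np

scores = np.random.rand(8, 8)  # L1 distances over the 8x8 search window
best = np.unravel_index(np.argmin(scores), scores.shape)
offset = (best[0] - 4, best[1] - 4)  # displacement relative to prev_offset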
def deviation(x, y, c, img):
    _gray = gray(x, y, c, img)

    r = hl.RDom([(-2, 5), (-2, 5)])

    # Mean of the 5x5 neighborhood
    avg = mkfunc('avg', _gray)
    avg[x, y] = 0.0
    avg[x, y] += _gray[x + r.x, y + r.y]
    avg[x, y] = avg[x, y] / 25.0

    # Variance of the 5x5 neighborhood around that mean
    deviation = mkfunc('deviation', avg)
    deviation[x, y] = 0.0
    deviation[x, y] += (_gray[x + r.x, y + r.y] - avg[x, y]) ** 2
    deviation[x, y] = deviation[x, y] / 25.0

    return deviation
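# `mkfunc` comes from the surrounding module. A plausible minimal sketch
# (hypothetical; the real helper may also record scheduling metadata about
# its inputs):
import halide as hl


def mkfunc(name, *inputs):
    # Create a named Func; `inputs` only document the dependency chain here
    return hl.Func(name)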
def gauss_7x7(input, name):
    k = hl.Buffer(hl.Float(32), [7], "gauss_7x7_kernel")
    k.translate([-3])

    rdom = hl.RDom([(-3, 7)])

    k.fill(0)
    k[-3] = 0.026267
    k[-2] = 0.100742
    k[-1] = 0.225511
    k[0] = 0.29496
    k[1] = 0.225511
    k[2] = 0.100742
    k[3] = 0.026267

    return gauss(input, k, rdom, name)
def test_atomics():
    x = hl.Var('x')
    im = hl.Func('im')
    f = hl.Func('f')
    im[x] = (x * x) % 5
    r = hl.RDom([(0, 100)])
    f[x] = 0
    f[hl.Expr(im[r])] += 1
    f.compute_root().update().atomic().parallel(r)
    b = f.realize(5)

    ref = [0, 0, 0, 0, 0]
    for i in range(100):
        idx = (i * i) % 5
        ref[idx] += 1
    for i in range(5):
        assert b[i] == ref[i]
def entropy(x, y, c, img, w, h, hist_index):
    base_gray = gray(x, y, c, img)
    clamped_gray = mkfunc('clamped_gray', base_gray)
    clamped_gray[x, y] = hl.clamp(base_gray[x, y], 0, 255)
    u8_gray = u8(x, y, c, clamped_gray)
    probabilities = histogram(x, y, c, u8_gray, w, h, hist_index)

    r = hl.RDom([(-2, 5), (-2, 5)])

    levels = mkfunc('entropy', img)
    levels[x, y] = 0.0
    # Add 0.00001 to the probability to prevent -Inf from log(0)
    levels[x, y] += base_gray[x + r.x, y + r.y] * \
        hl.log(probabilities[u8_gray[x + r.x, y + r.y]] + 0.00001)
    levels[x, y] = levels[x, y] * -1.0

    return levels
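# `gray` and `u8` are also assumed from the surrounding module. Plausible
# minimal sketches (the luma coefficients and names here are hypothetical):
def gray(x, y, c, img):
    g = mkfunc('gray', img)
    # Rec.601 luma weights, as an assumption about the real helper
    g[x, y] = 0.299 * img[x, y, 0] + 0.587 * img[x, y, 1] + 0.114 * img[x, y, 2]
    return g


def u8(x, y, c, img):
    f = mkfunc('u8', img)
    f[x, y] = hl.u8(img[x, y])
    return f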
def tone_map(input, width, height, compression, gain):
    print(f'Compression: {compression}, gain: {gain}')

    normal_dist = hl.Func("luma_weight_distribution")
    grayscale = hl.Func("grayscale")
    output = hl.Func("tone_map_output")

    x, y, c, v = hl.Var("x"), hl.Var("y"), hl.Var("c"), hl.Var("v")
    rdom = hl.RDom([(0, 3)])

    normal_dist[v] = hl.f32(hl.exp(-12.5 * hl.pow(hl.f32(v) / 65535 - 0.5, 2)))

    # Average the three color channels to get a grayscale image
    grayscale[x, y] = hl.u16(hl.sum(hl.u32(input[x, y, rdom])) / 3)

    dark = grayscale

    comp_const = 1
    gain_const = 1

    comp_slope = (compression - comp_const) / TONE_MAP_PASSES
    gain_slope = (gain - gain_const) / TONE_MAP_PASSES

    for i in range(TONE_MAP_PASSES):
        print(' pass', i)

        norm_comp = i * comp_slope + comp_const
        norm_gain = i * gain_slope + gain_const

        bright = brighten(dark, norm_comp)

        dark_gamma = gamma_correct(dark)
        bright_gamma = gamma_correct(bright)

        dark_gamma = combine2(dark_gamma, bright_gamma, width, height, normal_dist)

        dark = brighten(gamma_inverse(dark_gamma), norm_gain)

    output[x, y, c] = hl.u16_sat(hl.u32(input[x, y, c]) * hl.u32(dark[x, y])
                                 / hl.u32(hl.max(1, grayscale[x, y])))

    grayscale.compute_root().parallel(y).vectorize(x, 16)
    normal_dist.compute_root().vectorize(v, 16)

    return output
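# `brighten`, `gamma_correct`, `gamma_inverse`, and `combine2` come from the
# rest of the pipeline. A plausible minimal sketch of `brighten`, assuming it
# scales a uint16 image by a gain with saturation (hypothetical):
import halide as hl


def brighten(input, gain):
    output = hl.Func("brighten_output")
    x, y = hl.Var("x"), hl.Var("y")
    # Widen to 32 bits before scaling, then saturate back to uint16
    output[x, y] = hl.u16_sat(gain * hl.u32(input[x, y]))
    return output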
def test_basics5():
    # Test Func.in_()
    x, y = hl.Var('x'), hl.Var('y')
    f = hl.Func('f')
    g = hl.Func('g')
    h = hl.Func('h')
    f[x, y] = y
    r = hl.RDom([(0, 100)])
    g[x] = 0
    g[x] += f[x, r]
    h[x] = 0
    h[x] += f[x, r]
    f.in_(g).compute_at(g, x)
    f.in_(h).compute_at(h, x)
    g.compute_root()
    h.compute_root()
    p = hl.Pipeline([g, h])
    p.compile_jit()
def gaussian_down4(input, name):
    output = hl.Func(name)
    k = hl.Func(name + "_filter")
    x, y, n = hl.Var("x"), hl.Var("y"), hl.Var('n')
    rdom = hl.RDom([(-2, 5), (-2, 5)])

    # 5x5 integer approximation of a Gaussian kernel; the weights sum to 159
    k[x, y] = 0
    k[-2, -2] = 2
    k[-1, -2] = 4
    k[0, -2] = 5
    k[1, -2] = 4
    k[2, -2] = 2
    k[-2, -1] = 4
    k[-1, -1] = 9
    k[0, -1] = 12
    k[1, -1] = 9
    k[2, -1] = 4
    k[-2, 0] = 5
    k[-1, 0] = 12
    k[0, 0] = 15
    k[1, 0] = 12
    k[2, 0] = 5
    k[-2, 1] = 4
    k[-1, 1] = 9
    k[0, 1] = 12
    k[1, 1] = 9
    k[2, 1] = 4
    k[-2, 2] = 2
    k[-1, 2] = 4
    k[0, 2] = 5
    k[1, 2] = 4
    k[2, 2] = 2

    # Convolve and downsample by 4, normalizing by the kernel sum
    output[x, y, n] = hl.cast(
        hl.UInt(16),
        hl.sum(hl.cast(hl.UInt(32),
                       input[4 * x + rdom.x, 4 * y + rdom.y, n] * k[rdom.x, rdom.y])) / 159)

    k.compute_root().parallel(y).parallel(x)
    output.compute_root().parallel(y).vectorize(x, 16)

    return output
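# Quick sanity check (plain Python) that the kernel weights above sum to the
# 159 used as the normalization divisor:
weights = [2, 4, 5, 4, 2,
           4, 9, 12, 9, 4,
           5, 12, 15, 12, 5,
           4, 9, 12, 9, 4,
           2, 4, 5, 4, 2]
assert sum(weights) == 159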
def test_rdom():
    x = hl.Var("x")
    y = hl.Var("y")

    diagonal = hl.Func("diagonal")
    diagonal[x, y] = 1

    domain_width = 10
    domain_height = 10

    r = hl.RDom([(0, domain_width), (0, domain_height)])
    r.where(r.x <= r.y)

    diagonal[r.x, r.y] += 2
    output = diagonal.realize(domain_width, domain_height)

    for iy in range(domain_height):
        for ix in range(domain_width):
            if ix <= iy:
                assert output[ix, iy] == 3
            else:
                assert output[ix, iy] == 1

    assert r.x.name() == r[0].name()
    assert r.y.name() == r[1].name()

    try:
        r[-1].name()
        raise Exception("underflowing index should raise KeyError")
    except KeyError:
        pass

    try:
        r[2].name()
        raise Exception("overflowing index should raise KeyError")
    except KeyError:
        pass

    try:
        r["foo"].name()
        raise Exception("bad index type should raise TypeError")
    except TypeError:
        pass

    return 0
def gen_outputs(self):
    '''define the outputs'''
    nbfn = self.nbfn
    i, j = [self.vars[c] for c in "ij"]
    g_fock = self.funcs["g_fock"]
    g_dens = self.clamps["g_dens"]

    # output scalars
    rv = hl.Func("rv")
    # output matrix
    g_fock_out = hl.Func("g_fock_out")

    self.funcs.update({"rv": rv, "g_fock_out": g_fock_out})
    self.outputs.update({"rv": rv, "g_fock_out": g_fock_out})

    g_fock_out[i, j] = g_fock[i, j]

    rv[i] = hl.f64(0.0)
    r_rv = hl.RDom([(0, nbfn), (0, nbfn)])
    rv[0] += g_fock[r_rv] * g_dens[r_rv]
    rv[0] *= hl.f64(0.5)
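# The reduction above computes 0.5 * sum(g_fock * g_dens) over the whole
# nbfn x nbfn matrix; in NumPy terms (hypothetical small arrays):
import numpy as np

fock = np.random.rand(4, 4)
dens = np.random.rand(4, 4)
rv = 0.5 * np.sum(fock * dens)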
def srgb(input, ccm):
    srgb_matrix = hl.Func("srgb_matrix")
    output = hl.Func("srgb_output")

    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")
    rdom = hl.RDom([(0, 3)])

    # Load the 3x3 color correction matrix into a Func
    srgb_matrix[x, y] = hl.f32(0)
    srgb_matrix[0, 0] = hl.f32(ccm[0][0])
    srgb_matrix[1, 0] = hl.f32(ccm[0][1])
    srgb_matrix[2, 0] = hl.f32(ccm[0][2])
    srgb_matrix[0, 1] = hl.f32(ccm[1][0])
    srgb_matrix[1, 1] = hl.f32(ccm[1][1])
    srgb_matrix[2, 1] = hl.f32(ccm[1][2])
    srgb_matrix[0, 2] = hl.f32(ccm[2][0])
    srgb_matrix[1, 2] = hl.f32(ccm[2][1])
    srgb_matrix[2, 2] = hl.f32(ccm[2][2])

    # Matrix-vector product per pixel: output channel c is the dot product
    # of CCM row c with the input RGB vector
    output[x, y, c] = hl.u16_sat(hl.sum(srgb_matrix[rdom, c] * input[x, y, rdom]))

    return output
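# For reference, the same per-pixel arithmetic in NumPy (hypothetical CCM
# values): each output channel is a CCM row dotted with the RGB vector,
# saturated to uint16.
import numpy as np

ccm = np.array([[1.5, -0.3, -0.2],
                [-0.2, 1.6, -0.4],
                [-0.1, -0.3, 1.4]])
pixel = np.array([1000.0, 2000.0, 1500.0], dtype=np.float32)
out = np.clip(ccm @ pixel, 0, 65535).astype(np.uint16)  # u16_sat equivalent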
def white_balance(input, width, height, white_balance_r, white_balance_g0,
                  white_balance_g1, white_balance_b):
    output = hl.Func("white_balance_output")
    print(width, height, white_balance_r, white_balance_g0, white_balance_g1,
          white_balance_b)

    x, y = hl.Var("x"), hl.Var("y")
    # The reduction domain iterates over 2x2 Bayer quads; its extents must be
    # integers, so use floor division
    rdom = hl.RDom([(0, width // 2), (0, height // 2)])

    output[x, y] = hl.u16(0)

    # Scale each Bayer channel (R, G0, G1, B) by its white-balance gain
    output[rdom.x * 2, rdom.y * 2] = hl.u16_sat(
        white_balance_r * hl.f32(input[rdom.x * 2, rdom.y * 2]))
    output[rdom.x * 2 + 1, rdom.y * 2] = hl.u16_sat(
        white_balance_g0 * hl.f32(input[rdom.x * 2 + 1, rdom.y * 2]))
    output[rdom.x * 2, rdom.y * 2 + 1] = hl.u16_sat(
        white_balance_g1 * hl.f32(input[rdom.x * 2, rdom.y * 2 + 1]))
    output[rdom.x * 2 + 1, rdom.y * 2 + 1] = hl.u16_sat(
        white_balance_b * hl.f32(input[rdom.x * 2 + 1, rdom.y * 2 + 1]))

    output.compute_root().parallel(y).vectorize(x, 16)

    output.update(0).parallel(rdom.y)
    output.update(1).parallel(rdom.y)
    output.update(2).parallel(rdom.y)
    output.update(3).parallel(rdom.y)

    return output
def main(): # Declare some Vars to use below. x, y = hl.Var("x"), hl.Var("y") # Load a grayscale image to use as an input. image_path = os.path.join(os.path.dirname(__file__), "../../tutorial/images/gray.png") input_data = imageio.imread(image_path) if True: # making the image smaller to go faster input_data = input_data[:160, :150] assert input_data.dtype == np.uint8 input = hl.Buffer(input_data) # You can define a hl.Func in multiple passes. Let's see a toy # example first. if True: # The first definition must be one like we have seen already # - a mapping from Vars to an hl.Expr: f = hl.Func("f") f[x, y] = x + y # We call this first definition the "pure" definition. # But the later definitions can include computed expressions on # both sides. The simplest example is modifying a single point: f[3, 7] = 42 # We call these extra definitions "update" definitions, or # "reduction" definitions. A reduction definition is an # update definition that recursively refers back to the # function's current value at the same site: if False: e = f[x, y] + 17 print("f[x, y] + 17", e) print("(f[x, y] + 17).type()", e.type()) print("(f[x, y]).type()", f[x, y].type()) f[x, y] = f[x, y] + 17 # If we confine our update to a single row, we can # recursively refer to values in the same column: f[x, 3] = f[x, 0] * f[x, 10] # Similarly, if we confine our update to a single column, we # can recursively refer to other values in the same row. f[0, y] = f[0, y] / f[3, y] # The general rule is: Each hl.Var used in an update definition # must appear unadorned in the same position as in the pure # definition in all references to the function on the left- # and right-hand sides. So the following definitions are # legal updates: # x is used, so all uses of f must have x as the first argument. f[x, 17] = x + 8 # y is used, so all uses of f must have y as the second argument. f[0, y] = y * 8 f[x, x + 1] = x + 8 f[y / 2, y] = f[0, y] * 17 # But these ones would cause an error: # f[x, 0) = f[x + 1, 0) <- First argument to f on the right-hand-side must be 'x', not 'x + 1'. # f[y, y + 1) = y + 8 <- Second argument to f on the left-hand-side must be 'y', not 'y + 1'. # f[y, x) = y - x <- Arguments to f on the left-hand-side are in the wrong places. # f[3, 4) = x + y <- Free variables appear on the right-hand-side # but not the left-hand-side. # We'll realize this one just to make sure it compiles. The # second-to-last definition forces us to realize over a # domain that is taller than it is wide. f.realize(100, 101) # For each realization of f, each step runs in its entirety # before the next one begins. Let's trace the loads and # stores for a simpler example: g = hl.Func("g") g[x, y] = x + y # Pure definition g[2, 1] = 42 # First update definition g[x, 0] = g[x, 1] # Second update definition g.trace_loads() g.trace_stores() g.realize(4, 4) # Reading the log, we see that each pass is applied in turn. The # equivalent Python is: result = np.empty((4, 4), dtype=np.int) # Pure definition for yy in range(4): for xx in range(4): result[yy][xx] = xx + yy # First update definition result[1][2] = 42 # Second update definition for xx in range(4): result[0][xx] = result[1][xx] # end of section # Putting update passes inside loops. if True: # Starting with this pure definition: f = hl.Func("f") f[x, y] = x + y # Say we want an update that squares the first fifty rows. We # could do this by adding 50 update definitions: # f[x, 0) = f[x, 0) * f[x, 0) # f[x, 1) = f[x, 1) * f[x, 1) # f[x, 2) = f[x, 2) * f[x, 2) # ... 
# f[x, 49) = f[x, 49) * f[x, 49) # Or equivalently using a compile-time loop in our C++: # for (int i = 0 i < 50 i++) { # f[x, i) = f[x, i) * f[x, i) # # But it's more manageable and more flexible to put the loop # in the generated code. We do this by defining a "reduction # domain" and using it inside an update definition: r = hl.RDom([(0, 50)]) f[x, r] = f[x, r] * f[x, r] halide_result = f.realize(100, 100) # The equivalent Python is: py_result = np.empty((100, 100), dtype=np.int) for yy in range(100): for xx in range(100): py_result[yy][xx] = xx + yy for xx in range(100): for rr in range(50): # The loop over the reduction domain occurs inside of # the loop over any pure variables used in the update # step: py_result[rr][xx] = py_result[rr][xx] * py_result[rr][xx] # Check the results match: for yy in range(100): for xx in range(100): assert halide_result[xx, yy] == py_result[yy][xx], \ "halide_result(%d, %d) = %d instead of %d" % ( xx, yy, halide_result[xx, yy], py_result[yy][xx]) # Now we'll examine a real-world use for an update definition: # computing a histogram. if True: # Some operations on images can't be cleanly expressed as a pure # function from the output coordinates to the value stored # there. The classic example is computing a histogram. The # natural way to do it is to iterate over the input image, # updating histogram buckets. Here's how you do that in Halide: histogram = hl.Func("histogram") # Histogram buckets start as zero. histogram[x] = 0 # Define a multi-dimensional reduction domain over the input image: r = hl.RDom([(0, input.width()), (0, input.height())]) # For every point in the reduction domain, increment the # histogram bucket corresponding to the intensity of the # input image at that point. histogram[input[r.x, r.y]] += 1 halide_result = histogram.realize(256) # The equivalent Python is: py_result = np.empty((256), dtype=np.int) for xx in range(256): py_result[xx] = 0 for r_y in range(input.height()): for r_x in range(input.width()): py_result[input_data[r_x, r_y]] += 1 # Check the answers agree: for xx in range(256): assert py_result[xx] == halide_result[xx], \ "halide_result(%d) = %d instead of %d" % (xx, halide_result[xx], py_result[xx]) # Scheduling update steps if True: # The pure variables in an update step and can be # parallelized, vectorized, split, etc as usual. # Vectorizing, splitting, or parallelize the variables that # are part of the reduction domain is trickier. We'll cover # that in a later lesson. # Consider the definition: f = hl.Func("x") f[x, y] = x * y # Set the second row to equal the first row. f[x, 1] = f[x, 0] # Set the second column to equal the first column plus 2. f[1, y] = f[0, y] + 2 # The pure variables in each stage can be scheduled # independently. To control the pure definition, we schedule # as we have done in the past. The following code vectorizes # and parallelizes the pure definition only. f.vectorize(x, 4).parallel(y) # We use hl.Func::update(int) to get a handle to an update step # for the purposes of scheduling. The following line # vectorizes the first update step across x. We can't do # anything with y for this update step, because it doesn't # use y. f.update(0).vectorize(x, 4) # Now we parallelize the second update step in chunks of size # 4. yo, yi = hl.Var("yo"), hl.Var("yi") f.update(1).split(y, yo, yi, 4).parallel(yo) halide_result = f.realize(16, 16) # Here's the equivalent (serial) C: py_result = np.empty((16, 16), dtype=np.int) # Pure step. Vectorized in x and parallelized in y. 
for yy in range(16): # Should be a parallel for loop for x_vec in range(4): xx = [x_vec * 4, x_vec * 4 + 1, x_vec * 4 + 2, x_vec * 4 + 3] py_result[yy][xx[0]] = xx[0] * yy py_result[yy][xx[1]] = xx[1] * yy py_result[yy][xx[2]] = xx[2] * yy py_result[yy][xx[3]] = xx[3] * yy # First update. Vectorized in x. for x_vec in range(4): xx = [x_vec * 4, x_vec * 4 + 1, x_vec * 4 + 2, x_vec * 4 + 3] py_result[1][xx[0]] = py_result[0][xx[0]] py_result[1][xx[1]] = py_result[0][xx[1]] py_result[1][xx[2]] = py_result[0][xx[2]] py_result[1][xx[3]] = py_result[0][xx[3]] # Second update. Parallelized in chunks of size 4 in y. for yo in range(4): # Should be a parallel for loop for yi in range(4): yy = yo * 4 + yi py_result[yy][1] = py_result[yy][0] + 2 # Check the C and Halide results match: for yy in range(16): for xx in range(16): assert halide_result[xx, yy] == py_result[yy][xx], \ "halide_result(%d, %d) = %d instead of %d" % ( xx, yy, halide_result[xx, yy], py_result[yy][xx]) # That covers how to schedule the variables within a hl.Func that # uses update steps, but what about producer-consumer # relationships that involve compute_at and store_at? Let's # examine a reduction as a producer, in a producer-consumer pair. if True: # Because an update does multiple passes over a stored array, # it's not meaningful to inline them. So the default schedule # for them does the closest thing possible. It computes them # in the innermost loop of their consumer. Consider this # trivial example: producer, consumer = hl.Func("producer"), hl.Func("consumer") producer[x] = x * 17 producer[x] += 1 consumer[x] = 2 * producer[x] halide_result = consumer.realize(10) # The equivalent Python is: py_result = np.empty((10), dtype=np.int) for xx in range(10): producer_storage = np.empty((1), dtype=np.int) # Pure step for producer producer_storage[0] = xx * 17 # Update step for producer producer_storage[0] = producer_storage[0] + 1 # Pure step for consumer py_result[xx] = 2 * producer_storage[0] # Check the results match for xx in range(10): assert halide_result[xx] == py_result[xx], \ "halide_result(%d) = %d instead of %d" % (xx, halide_result[xx], py_result[xx]) # For all other compute_at/store_at options, the reduction # gets placed where you would expect, somewhere in the loop # nest of the consumer. # Now let's consider a reduction as a consumer in a # producer-consumer pair. This is a little more involved. if True: if True: # Case 1: The consumer references the producer in the pure step # only. producer, consumer = hl.Func("producer"), hl.Func("consumer") # The producer is pure. producer[x] = x * 17 consumer[x] = 2 * producer[x] consumer[x] += 1 # The valid schedules for the producer in this case are # the default schedule - inlined, and also: # # 1) producer.compute_at(x), which places the computation of # the producer inside the loop over x in the pure step of the # consumer. # # 2) producer.compute_root(), which computes all of the # producer ahead of time. # # 3) producer.store_root().compute_at(x), which allocates # space for the consumer outside the loop over x, but fills # it in as needed inside the loop. # # Let's use option 1. 
producer.compute_at(consumer, x) halide_result = consumer.realize(10) # The equivalent Python is: py_result = np.empty((10), dtype=np.int) # Pure step for the consumer for xx in range(10): # Pure step for producer producer_storage = np.empty((1), dtype=np.int) producer_storage[0] = xx * 17 py_result[xx] = 2 * producer_storage[0] # Update step for the consumer for xx in range(10): py_result[xx] += 1 # All of the pure step is evaluated before any of the # update step, so there are two separate loops over x. # Check the results match for xx in range(10): assert halide_result[xx] == py_result[xx], \ "halide_result(%d) = %d instead of %d" % (xx, halide_result[xx], py_result[xx]) if True: # Case 2: The consumer references the producer in the update step # only producer, consumer = hl.Func("producer"), hl.Func("consumer") producer[x] = x * 17 consumer[x] = x consumer[x] += producer[x] # Again we compute the producer per x coordinate of the # consumer. This places producer code inside the update # step of the producer, because that's the only step that # uses the producer. producer.compute_at(consumer, x) # Note however, that we didn't say: # # producer.compute_at(consumer.update(0), x). # # Scheduling is done with respect to Vars of a hl.Func, and # the Vars of a hl.Func are shared across the pure and # update steps. halide_result = consumer.realize(10) # The equivalent Python is: py_result = np.empty((10), dtype=np.int) # Pure step for the consumer for xx in range(10): py_result[xx] = xx # Update step for the consumer for xx in range(10): # Pure step for producer producer_storage = np.empty((1), dtype=np.int) producer_storage[0] = xx * 17 py_result[xx] += producer_storage[0] # Check the results match for xx in range(10): assert halide_result[xx] == py_result[xx], \ "halide_result(%d) = %d instead of %d" % (xx, halide_result[xx], py_result[xx]) if True: # Case 3: The consumer references the producer in # multiple steps that share common variables producer, consumer = hl.Func("producer"), hl.Func("consumer") producer[x] = x * 17 consumer[x] = producer[x] * x consumer[x] += producer[x] # Again we compute the producer per x coordinate of the # consumer. This places producer code inside both the # pure and the update step of the producer. So there ends # up being two separate realizations of the producer, and # redundant work occurs. producer.compute_at(consumer, x) halide_result = consumer.realize(10) # The equivalent Python is: py_result = np.empty((10), dtype=np.int) # Pure step for the consumer for xx in range(10): # Pure step for producer producer_storage = np.empty((1), dtype=np.int) producer_storage[0] = xx * 17 py_result[xx] = producer_storage[0] * xx # Update step for the consumer for xx in range(10): # Another copy of the pure step for producer producer_storage = np.empty((1), dtype=np.int) producer_storage[0] = xx * 17 py_result[xx] += producer_storage[0] # Check the results match for xx in range(10): assert halide_result[xx] == py_result[xx], \ "halide_result(%d) = %d instead of %d" % (xx, halide_result[xx], py_result[xx]) if True: # Case 4: The consumer references the producer in # multiple steps that do not share common variables producer, consumer = hl.Func("producer"), hl.Func("consumer") producer[x, y] = x * y consumer[x, y] = x + y consumer[x, 0] = producer[x, x - 1] consumer[0, y] = producer[y, y - 1] # In this case neither producer.compute_at(consumer, x) # nor producer.compute_at(consumer, y) will work, because # either one fails to cover one of the uses of the # producer. 
            # So we'd have to inline producer, or use
            # producer.compute_root().

            # Let's say we really really want producer to be
            # compute_at the inner loops of both consumer update
            # steps. Halide doesn't allow multiple different
            # schedules for a single hl.Func, but we can work around it
            # by making two wrappers around producer, and scheduling
            # those instead:
            producer_wrapper_1, producer_wrapper_2, consumer_2 = hl.Func(), hl.Func(), hl.Func()
            producer_wrapper_1[x, y] = producer[x, y]
            producer_wrapper_2[x, y] = producer[x, y]
            consumer_2[x, y] = x + y
            consumer_2[x, 0] += producer_wrapper_1[x, x - 1]
            consumer_2[0, y] += producer_wrapper_2[y, y - 1]

            # The wrapper functions give us two separate handles on
            # the producer, so we can schedule them differently.
            producer_wrapper_1.compute_at(consumer_2, x)
            producer_wrapper_2.compute_at(consumer_2, y)
            halide_result = consumer_2.realize(10, 10)

            # The equivalent Python is:
            py_result = np.empty((10, 10), dtype=np.int32)
            # Pure step for the consumer
            for yy in range(10):
                for xx in range(10):
                    py_result[yy][xx] = xx + yy
            # First update step for consumer
            for xx in range(10):
                producer_wrapper_1_storage = np.empty((1), dtype=np.int32)
                producer_wrapper_1_storage[0] = xx * (xx - 1)
                py_result[0][xx] += producer_wrapper_1_storage[0]
            # Second update step for consumer
            for yy in range(10):
                producer_wrapper_2_storage = np.empty((1), dtype=np.int32)
                producer_wrapper_2_storage[0] = yy * (yy - 1)
                py_result[yy][0] += producer_wrapper_2_storage[0]

            # Check the results match
            for yy in range(10):
                for xx in range(10):
                    assert halide_result[xx, yy] == py_result[yy][xx], \
                        "halide_result(%d, %d) = %d instead of %d" % (
                            xx, yy, halide_result[xx, yy], py_result[yy][xx])

        if True:
            # Case 5: Scheduling a producer under a reduction domain
            # variable of the consumer.

            # We are not just restricted to scheduling producers at
            # the loops over the pure variables of the consumer. If a
            # producer is only used within a loop over a reduction
            # domain (hl.RDom) variable, we can also schedule the
            # producer there.
            producer, consumer = hl.Func("producer"), hl.Func("consumer")
            r = hl.RDom([(0, 5)])
            producer[x] = x * 17
            consumer[x] = x + 10
            consumer[x] += r + producer[x + r]
            producer.compute_at(consumer, r)
            halide_result = consumer.realize(10)

            # The equivalent Python is:
            py_result = np.empty((10), dtype=np.int32)
            # Pure step for the consumer.
            for xx in range(10):
                py_result[xx] = xx + 10
            # Update step for the consumer.
            for xx in range(10):
                # The loop over the reduction domain is always the inner loop.
                for rr in range(5):
                    # We've scheduled the storage and computation of
                    # the producer here. We just need a single value.
                    producer_storage = np.empty((1), dtype=np.int32)
                    # Pure step of the producer.
                    producer_storage[0] = (xx + rr) * 17
                    # Now use it in the update step of the consumer.
                    py_result[xx] += rr + producer_storage[0]

            # Check the results match
            for xx in range(10):
                assert halide_result[xx] == py_result[xx], \
                    "halide_result(%d) = %d instead of %d" % (xx, halide_result[xx], py_result[xx])

    # A real-world example of a reduction inside a producer-consumer chain.
    if True:
        # The default schedule for a reduction is a good one for
        # convolution-like operations. For example, the following
        # computes a 5x5 box-blur of our grayscale test image with a
        # clamp-to-edge boundary condition:

        # First add the boundary condition.
        clamped = hl.BoundaryConditions.repeat_edge(input)

        # Define a 5x5 box that starts at (-2, -2).
        r = hl.RDom([(-2, 5), (-2, 5)])

        # Compute the 5x5 sum around each pixel.
        local_sum = hl.Func("local_sum")
        # Initializing with an integer makes the sum a 32-bit integer.
        local_sum[x, y] = 0
        local_sum[x, y] += clamped[x + r.x, y + r.y]

        # Divide the sum by 25 to make it an average.
        blurry = hl.Func("blurry")
        blurry[x, y] = hl.cast(hl.UInt(8), local_sum[x, y] / 25)
        halide_result = blurry.realize(input.width(), input.height())

        # The default schedule will inline 'clamped' into the update
        # step of 'local_sum', because clamped only has a pure
        # definition, and so its default schedule is fully-inlined.
        # We will then compute local_sum per x coordinate of blurry,
        # because the default schedule for reductions is
        # compute-innermost. Here's the equivalent Python:
        local_sum = np.empty((1), dtype=np.int32)
        py_result = hl.Buffer(hl.UInt(8), [input.width(), input.height()])
        for yy in range(input.height()):
            for xx in range(input.width()):  # FIXME this loop is quite slow
                # Pure step of local_sum.
                local_sum[0] = 0
                # Update step of local_sum.
                for r_y in range(-2, 2 + 1):
                    for r_x in range(-2, 2 + 1):
                        # The clamping has been inlined into the update step.
                        clamped_x = min(max(xx + r_x, 0), input.width() - 1)
                        clamped_y = min(max(yy + r_y, 0), input.height() - 1)
                        local_sum[0] += input[clamped_x, clamped_y]
                # Pure step of blurry. The cast to uint8 happens when
                # storing into the hl.Buffer.
                py_result[xx, yy] = int(local_sum[0] / 25)

        # Check the results match
        for yy in range(input.height()):
            for xx in range(input.width()):
                assert halide_result[xx, yy] == py_result[xx, yy], \
                    "halide_result(%d, %d) = %d instead of %d" % (
                        xx, yy, halide_result[xx, yy], py_result[xx, yy])

    # Reduction helpers.
    if True:
        # There are several reduction helper functions provided in
        # Halide.h, which compute small reductions and schedule them
        # innermost into their consumer. The most useful one is
        # "sum".
        f1 = hl.Func("f1")
        r = hl.RDom([(0, 100)])
        f1[x] = hl.sum(r + x) * 7

        # hl.sum creates a small anonymous hl.Func to do the
        # reduction. It's equivalent to:
        f2, anon = hl.Func("f2"), hl.Func("anon")
        anon[x] = 0
        anon[x] += r + x
        f2[x] = anon[x] * 7

        # So even though f1 references a reduction domain, it is a
        # pure function. The reduction domain has been swallowed to
        # define the inner anonymous reduction.
        halide_result_1 = f1.realize(10)
        halide_result_2 = f2.realize(10)

        # The equivalent Python is:
        py_result = np.empty((10), dtype=np.int32)
        for xx in range(10):
            anon = np.empty((1), dtype=np.int32)
            anon[0] = 0
            for rr in range(100):
                anon[0] += rr + xx
            py_result[xx] = anon[0] * 7

        # Check they all match.
        for xx in range(10):
            assert halide_result_1[xx] == py_result[xx], \
                "halide_result_1(%d) = %d instead of %d" % (xx, halide_result_1[xx], py_result[xx])
            assert halide_result_2[xx] == py_result[xx], \
                "halide_result_2(%d) = %d instead of %d" % (xx, halide_result_2[xx], py_result[xx])

    print("Success!")
    return 0
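
# A short companion sketch (added for illustration, not part of the
# original lesson): hl.maximum and hl.minimum follow the same pattern as
# hl.sum above, swallowing the RDom into a small anonymous inner Func.
# It assumes a Halide Python API where realize() accepts a list of
# sizes; the helper name reduction_helpers_sketch is ours.
def reduction_helpers_sketch():
    xv = hl.Var("x")
    r = hl.RDom([(0, 10)])
    g = hl.Func("g")
    # For every x, the max over r of (r + x) is x + 9 and the min is x,
    # so g(x) should be exactly 9 everywhere.
    g[xv] = hl.maximum(r + xv) - hl.minimum(r + xv)
    out = g.realize([5])
    assert np.all(np.asarray(out) == 9)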
def bilateral_filter(input, width, height): print(' bilateral_filter') k = hl.Buffer(hl.Float(32), [7, 7], "gauss_kernel") k.translate([-3, -3]) weights = hl.Func("bilateral_weights") total_weights = hl.Func("bilateral_total_weights") bilateral = hl.Func("bilateral") output = hl.Func("bilateral_filter_output") x, y, dx, dy, c = hl.Var("x"), hl.Var("y"), hl.Var("dx"), hl.Var("dy"), hl.Var("c") rdom = hl.RDom([(-3, 7), (-3, 7)]) k.fill(0) k[-3, -3] = 0.000690 k[-2, -3] = 0.002646 k[-1, -3] = 0.005923 k[0, -3] = 0.007748 k[1, -3] = 0.005923 k[2, -3] = 0.002646 k[3, -3] = 0.000690 k[-3, -2] = 0.002646 k[-2, -2] = 0.010149 k[-1, -2] = 0.022718 k[0, -2] = 0.029715 k[1, -2] = 0.022718 k[2, -2] = 0.010149 k[3, -2] = 0.002646 k[-3, -1] = 0.005923 k[-2, -1] = 0.022718 k[-1, -1] = 0.050855 k[0, -1] = 0.066517 k[1, -1] = 0.050855 k[2, -1] = 0.022718 k[3, -1] = 0.005923 k[-3, 0] = 0.007748 k[-2, 0] = 0.029715 k[-1, 0] = 0.066517 k[0, 0] = 0.087001 k[1, 0] = 0.066517 k[2, 0] = 0.029715 k[3, 0] = 0.007748 k[-3, 1] = 0.005923 k[-2, 1] = 0.022718 k[-1, 1] = 0.050855 k[0, 1] = 0.066517 k[1, 1] = 0.050855 k[2, 1] = 0.022718 k[3, 1] = 0.005923 k[-3, 2] = 0.002646 k[-2, 2] = 0.010149 k[-1, 2] = 0.022718 k[0, 2] = 0.029715 k[1, 2] = 0.022718 k[2, 2] = 0.010149 k[3, 2] = 0.002646 k[-3, 3] = 0.000690 k[-2, 3] = 0.002646 k[-1, 3] = 0.005923 k[0, 3] = 0.007748 k[1, 3] = 0.005923 k[2, 3] = 0.002646 k[3, 3] = 0.000690 input_mirror = hl.BoundaryConditions.mirror_interior(input, [(0, width), (0, height)]) dist = hl.cast(hl.Float(32), hl.cast(hl.Int(32), input_mirror[x, y, c]) - hl.cast(hl.Int(32), input_mirror[x + dx, y + dy, c])) sig2 = 100 threshold = 25000 score = hl.select(hl.abs(input_mirror[x + dx, y + dy, c]) > threshold, 0, hl.exp(-dist * dist / sig2)) weights[dx, dy, x, y, c] = k[dx, dy] * score total_weights[x, y, c] = hl.sum(weights[rdom.x, rdom.y, x, y, c]) bilateral[x, y, c] = hl.sum(input_mirror[x + rdom.x, y + rdom.y, c] * weights[rdom.x, rdom.y, x, y, c]) / \ total_weights[x, y, c] output[x, y, c] = hl.cast(hl.Float(32), input[x, y, c]) output[x, y, 1] = bilateral[x, y, 1] output[x, y, 2] = bilateral[x, y, 2] weights.compute_at(output, y).vectorize(x, 16) output.compute_root().parallel(y).vectorize(x, 16) output.update(0).parallel(y).vectorize(x, 16) output.update(1).parallel(y).vectorize(x, 16) return output
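
# A pure-NumPy reference for the weighting scheme in bilateral_filter
# above (added for illustration; clamp-to-edge is used here instead of
# mirror_interior, and the helper name is ours). For one pixel of one
# channel it builds the same 7x7 spatially- and range-weighted kernel,
# normalizes it, and returns the filtered value.
def bilateral_reference_pixel(img, x, y, k, sig2=100.0, threshold=25000):
    # img: 2D array for a single channel; k: 7x7 spatial kernel
    h, w = img.shape
    acc, wsum = 0.0, 0.0
    for dy in range(-3, 4):
        for dx in range(-3, 4):
            xx = min(max(x + dx, 0), w - 1)  # clamp instead of mirroring
            yy = min(max(y + dy, 0), h - 1)
            d = float(img[y, x]) - float(img[yy, xx])
            # Same gating as the Halide version: samples above the
            # threshold get zero weight, otherwise a Gaussian on the
            # intensity difference, scaled by the spatial kernel k.
            score = 0.0 if abs(float(img[yy, xx])) > threshold else np.exp(-d * d / sig2)
            wgt = k[dy + 3, dx + 3] * score
            acc += wgt * float(img[yy, xx])
            wsum += wgt
    return acc / wsum if wsum > 0 else float(img[y, x])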
def get_bilateral_grid(input, r_sigma, s_sigma):
    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')
    xi = hl.Var('xi')
    yi = hl.Var('yi')
    zi = hl.Var('zi')

    # Add a boundary condition
    clamped = hl.BoundaryConditions.repeat_edge(input)

    # Construct the bilateral grid
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    val = clamped[x * s_sigma + r.x - s_sigma // 2, y * s_sigma + r.y - s_sigma // 2]
    val = hl.clamp(val, 0.0, 1.0)
    zv_idx = hl.i32(val / r_sigma + 0.5)
    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0
    histogram[x, y, zv_idx, c] += hl.select(c == 0, val, 1.0)

    # Blur the histogram using a five-tap filter
    blurx, blury, blurz = hl.Func('blurx'), hl.Func('blury'), hl.Func('blurz')
    blurz[x, y, z, c] = (histogram[x, y, z - 2, c] + histogram[x, y, z - 1, c] * 4 +
                         histogram[x, y, z, c] * 6 + histogram[x, y, z + 1, c] * 4 +
                         histogram[x, y, z + 2, c])
    blurx[x, y, z, c] = (blurz[x - 2, y, z, c] + blurz[x - 1, y, z, c] * 4 +
                         blurz[x, y, z, c] * 6 + blurz[x + 1, y, z, c] * 4 +
                         blurz[x + 2, y, z, c])
    blury[x, y, z, c] = (blurx[x, y - 2, z, c] + blurx[x, y - 1, z, c] * 4 +
                         blurx[x, y, z, c] * 6 + blurx[x, y + 1, z, c] * 4 +
                         blurx[x, y + 2, z, c])

    # Take trilinear samples to compute the output. Note that the
    # sample coordinates are Exprs; they must not reuse the names of
    # the schedule Vars xi/yi/zi above, or the GPU schedule below
    # would receive Exprs where it expects Vars.
    val = hl.clamp(clamped[x, y], 0.0, 1.0)
    zv = val / r_sigma
    z_idx = hl.i32(zv)
    zf = zv - z_idx
    xf = hl.f32(x % s_sigma) / s_sigma
    yf = hl.f32(y % s_sigma) / s_sigma
    x_idx = x / s_sigma
    y_idx = y / s_sigma
    interpolated = hl.Func('interpolated')
    interpolated[x, y, c] = hl.lerp(
        hl.lerp(hl.lerp(blury[x_idx, y_idx, z_idx, c], blury[x_idx + 1, y_idx, z_idx, c], xf),
                hl.lerp(blury[x_idx, y_idx + 1, z_idx, c], blury[x_idx + 1, y_idx + 1, z_idx, c], xf), yf),
        hl.lerp(hl.lerp(blury[x_idx, y_idx, z_idx + 1, c], blury[x_idx + 1, y_idx, z_idx + 1, c], xf),
                hl.lerp(blury[x_idx, y_idx + 1, z_idx + 1, c], blury[x_idx + 1, y_idx + 1, z_idx + 1, c], xf), yf),
        zf)

    # Normalize
    bilateral_grid = hl.Func('bilateral_grid')
    bilateral_grid[x, y] = interpolated[x, y, 0] / interpolated[x, y, 1]

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU schedule.
        # Currently running this directly from the Python code is very
        # slow, probably because of dispatch overhead; the generated
        # code itself is the same speed as the C++-generated code.
        print("Compiling for GPU.")
        histogram.compute_root().reorder(c, z, x, y).gpu_tile(x, y, xi, yi, 8, 8)
        histogram.update().reorder(c, r.x, r.y, x, y).gpu_tile(x, y, xi, yi, 8, 8).unroll(c)
        blurx.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blury.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blurz.compute_root().gpu_tile(x, y, z, xi, yi, zi, 8, 8, 4)
        bilateral_grid.compute_root().gpu_tile(x, y, xi, yi, s_sigma, s_sigma)
    else:
        # CPU schedule.
        print("Compiling for CPU.")
        histogram.compute_root().parallel(z)
        histogram.update().reorder(c, r.x, r.y, x, y).unroll(c)
        blurz.compute_root().reorder(c, z, x, y).parallel(y).vectorize(x, 4).unroll(c)
        blurx.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        blury.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        bilateral_grid.compute_root().parallel(y).vectorize(x, 4)

    return bilateral_grid
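
# A hypothetical driver for the pipeline above (the function name and
# the in-memory invocation are ours; the r_sigma/s_sigma defaults mirror
# values used elsewhere in this file). It wires a 2D float32 image
# through an ImageParam and realizes the pipeline at full resolution.
def run_bilateral_grid(gray, r_sigma=0.1, s_sigma=8):
    # gray: np.float32 array of shape (height, width), values in [0, 1]
    inp = hl.ImageParam(hl.Float(32), 2, 'input')
    pipeline = get_bilateral_grid(inp, r_sigma, s_sigma)
    inp.set(hl.Buffer(gray))
    out = pipeline.realize([gray.shape[1], gray.shape[0]])
    return np.asarray(out)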
def main():
    # So far Funcs (such as the one below) have evaluated to a single
    # scalar value for each point in their domain.
    single_valued = hl.Func()
    x, y = hl.Var("x"), hl.Var("y")
    single_valued[x, y] = x + y

    # One way to write a hl.Func that returns a collection of values is
    # to add an additional dimension which indexes that
    # collection. This is how we typically deal with color. For
    # example, the hl.Func below represents a collection of three values
    # for every x, y coordinate indexed by c.
    color_image = hl.Func()
    c = hl.Var("c")
    color_image[x, y, c] = hl.select(c == 0, 245,  # Red value
                                     c == 1, 42,   # Green value
                                     132)          # Blue value

    # Since this pattern appears quite often, Halide provides
    # syntactic sugar to write the code above as the following,
    # using the "mux" function.
    # color_image[x, y, c] = hl.mux(c, [245, 42, 132])

    # This method is often convenient because it makes it easy to
    # operate on this hl.Func in a way that treats each item in the
    # collection equally:
    brighter = hl.Func()
    brighter[x, y, c] = color_image[x, y, c] + 10

    # However this method is also inconvenient for three reasons.
    #
    # 1) Funcs are defined over an infinite domain, so users of this
    # hl.Func can for example access color_image(x, y, -17), which is
    # not a meaningful value and is probably indicative of a bug.
    #
    # 2) It requires a hl.select, which can impact performance if not
    # bounded and unrolled:
    # brighter.bound(c, 0, 3).unroll(c)
    #
    # 3) With this method, all values in the collection must have the
    # same type. While the above two issues are merely inconvenient,
    # this one is a hard limitation that makes it impossible to
    # express certain things in this way.

    # It is also possible to represent a collection of values as a
    # collection of Funcs:
    func_array = [hl.Func() for i in range(3)]
    func_array[0][x, y] = x + y
    func_array[1][x, y] = hl.sin(x)
    func_array[2][x, y] = hl.cos(y)

    # This method avoids the three problems above, but introduces a
    # new annoyance. Because these are separate Funcs, it is
    # difficult to schedule them so that they are all computed
    # together inside a single loop over x, y.

    # A third alternative is to define a hl.Func as evaluating to a
    # Tuple instead of an hl.Expr. A Tuple is a fixed-size collection of
    # Exprs which may have different types. The following function
    # evaluates to an integer value (x + y) and a floating point value
    # (hl.sin(x * y)).
    multi_valued = hl.Func("multi_valued")
    multi_valued[x, y] = (x + y, hl.sin(x * y))

    # Realizing a tuple-valued hl.Func returns a collection of
    # Buffers. We call this a Realization. It's equivalent to a
    # std::vector of hl.Buffer/Image objects:
    if True:
        im1, im2 = multi_valued.realize([80, 60])
        assert im1.type() == hl.Int(32)
        assert im2.type() == hl.Float(32)
        assert im1[30, 40] == 30 + 40
        assert np.isclose(im2[30, 40], math.sin(30 * 40))

    # You can also pass a tuple of pre-allocated buffers to realize()
    # rather than having new ones created. (The Buffers must have the
    # correct types and identical sizes.)
    if True:
        im1, im2 = hl.Buffer(hl.Int(32), [80, 60]), hl.Buffer(hl.Float(32), [80, 60])
        multi_valued.realize((im1, im2))
        assert im1[30, 40] == 30 + 40
        assert np.isclose(im2[30, 40], math.sin(30 * 40))

    # All Tuple elements are evaluated together over the same domain
    # in the same loop nest, but stored in distinct allocations.
    # The equivalent Python code to the above is:
    if True:
        multi_valued_0 = np.empty((80 * 60), dtype=np.int32)
        multi_valued_1 = np.empty((80 * 60), dtype=np.float32)
        for yy in range(60):
            for xx in range(80):
                multi_valued_0[xx + 80 * yy] = xx + yy
                multi_valued_1[xx + 80 * yy] = math.sin(xx * yy)

    # When compiling ahead-of-time, a Tuple-valued hl.Func evaluates
    # into multiple distinct output halide_buffer_t structs. These
    # appear in order at the end of the function signature:
    # int multi_valued(...input buffers and params..., halide_buffer_t
    # *output_1, halide_buffer_t *output_2)

    # You can construct a Tuple by passing multiple Exprs to the
    # Tuple constructor as we did above. Perhaps more elegantly, you
    # can also take advantage of initializer lists and just
    # enclose your Exprs in braces:
    multi_valued_2 = hl.Func("multi_valued_2")
    multi_valued_2[x, y] = (x + y, hl.sin(x * y))

    # Calls to a multi-valued hl.Func cannot be treated as Exprs. The
    # following is a syntax error:
    # consumer = hl.Func()
    # consumer[x, y] = multi_valued_2[x, y] + 10

    # Instead you must index the returned object with square brackets
    # to retrieve the individual Exprs:
    integer_part = multi_valued_2[x, y][0]
    floating_part = multi_valued_2[x, y][1]
    assert type(integer_part) is hl.FuncTupleElementRef
    assert type(floating_part) is hl.FuncTupleElementRef
    consumer = hl.Func()
    consumer[x, y] = (integer_part + 10, floating_part + 10.0)

    # Tuple reductions.
    if True:
        # Tuples are particularly useful in reductions, as they allow
        # the reduction to maintain complex state as it walks along
        # its domain. The simplest example is an argmax.

        # First we create an Image to take the argmax over.
        input_func = hl.Func()
        input_func[x] = hl.sin(x)
        input = input_func.realize([100])
        assert input.type() == hl.Float(32)

        # Then we define a 2-valued Tuple which tracks the maximum
        # value and its index.
        arg_max = hl.Func()

        # Pure definition.
        # (Using [()] for zero-dimensional Funcs is a convention of
        # this Python interface.)
        arg_max[()] = (0, input[0])

        # Update definition.
        r = hl.RDom([(1, 99)])
        old_index = arg_max[()][0]
        old_max = arg_max[()][1]
        new_index = hl.select(old_max < input[r], r, old_index)
        new_max = hl.max(input[r], old_max)
        arg_max[()] = (new_index, new_max)

        # The equivalent Python is:
        arg_max_0 = 0
        arg_max_1 = float(input[0])
        for rr in range(1, 100):
            old_index = arg_max_0
            old_max = arg_max_1
            new_index = rr if (old_max < input[rr]) else old_index
            new_max = max(input[rr], old_max)
            # In a tuple update definition, all loads and computation
            # are done before any stores, so that all Tuple elements
            # are updated atomically with respect to recursive calls
            # to the same hl.Func.
            arg_max_0 = new_index
            arg_max_1 = new_max

        # Let's verify that the Halide and Python loops found the same
        # maximum value and index.
        if True:
            r0, r1 = arg_max.realize()
            assert r0.type() == hl.Int(32)
            assert r1.type() == hl.Float(32)
            assert arg_max_0 == r0[()]
            assert np.isclose(arg_max_1, r1[()])

        # Halide provides hl.argmax and hl.argmin as built-in
        # reductions similar to sum, product, maximum, and minimum.
        # They return a Tuple consisting of the point in the reduction
        # domain corresponding to that value, and the value itself. In
        # the case of ties they return the first value found. We'll use
        # one of these in the following section.

    # Tuples for user-defined types.
    if True:
        # Tuples can also be a convenient way to represent compound
        # objects such as complex numbers. Defining an object that
        # can be converted to and from a Tuple is one way to extend
        # Halide's type system with user-defined types.
        class Complex:
            def __init__(self, r, i=None):
                if isinstance(r, float) and isinstance(i, float):
                    self.real = hl.Expr(r)
                    self.imag = hl.Expr(i)
                elif i is not None:
                    self.real = r
                    self.imag = i
                else:
                    self.real = r[0]
                    self.imag = r[1]

            def as_tuple(self):
                "Convert to a Tuple"
                return (self.real, self.imag)

            def __add__(self, other):
                "Complex addition"
                return Complex(self.real + other.real, self.imag + other.imag)

            def __mul__(self, other):
                "Complex multiplication"
                return Complex(self.real * other.real - self.imag * other.imag,
                               self.real * other.imag + self.imag * other.real)

            def __getitem__(self, idx):
                return (self.real, self.imag)[idx]

            def __len__(self):
                return 2

            def magnitude(self):
                "Complex magnitude, squared"
                return (self.real * self.real) + (self.imag * self.imag)

            # Other complex operators would go here. The above are
            # sufficient for this example.

        # Let's use the Complex struct to compute a Mandelbrot set.
        mandelbrot = hl.Func()

        # The initial complex value corresponding to an x, y coordinate
        # in our hl.Func.
        initial = Complex(x / 15.0 - 2.5, y / 6.0 - 2.0)

        # Pure definition.
        t = hl.Var("t")
        mandelbrot[x, y, t] = Complex(0.0, 0.0)

        # We'll use an update definition to take 12 steps.
        r = hl.RDom([(1, 12)])
        current = Complex(mandelbrot[x, y, r - 1])

        # The following line uses the complex multiplication and
        # addition we defined above.
        mandelbrot[x, y, r] = current * current + initial

        # We'll use another tuple reduction to compute the iteration
        # number where the value first escapes a circle of radius 4.
        # This can be expressed as an hl.argmin of a boolean - we want
        # the index of the first time the given boolean expression is
        # false (we consider false to be less than true). The argmax
        # would return the index of the first time the expression is
        # true.
        escape_condition = Complex(mandelbrot[x, y, r]).magnitude() < 16.0
        first_escape = hl.argmin(escape_condition)
        assert type(first_escape) is tuple

        # We only want the index, not the value, but hl.argmin returns
        # both, so we'll index the hl.argmin Tuple expression using
        # square brackets to get the hl.Expr representing the index.
        escape = hl.Func()
        escape[x, y] = first_escape[0]

        # Realize the pipeline and print the result as ASCII art.
        result = escape.realize([61, 25])
        assert result.type() == hl.Int(32)
        code = " .:-~*={&%#@"
        for yy in range(result.height()):
            for xx in range(result.width()):
                index = result[xx, yy]
                # Skip indices past the end of the palette. (The author
                # notes the C++ version of this lesson may index out of
                # range here.)
                if index < len(code):
                    print("%c" % code[index], end="")
            print("")

    print("Success!")
    return 0
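
# A short sketch of the built-in hl.argmax reduction mentioned above
# (added for illustration; the function name argmax_sketch is ours). It
# follows the same zero-dimensional [()] convention used in main().
def argmax_sketch():
    x = hl.Var('x')
    f = hl.Func('f')
    f[x] = hl.sin(x / 10.0)
    r = hl.RDom([(0, 100)])
    g = hl.Func('g')
    # hl.argmax returns a Tuple: (index into the RDom, value there).
    g[()] = hl.argmax(f[r])
    idx, val = g.realize()
    # sin(x / 10) peaks at the sample closest to pi/2 * 10 ~= 15.7,
    # and sin(1.6) > sin(1.5), so the argmax index should be 16.
    assert idx[()] == 16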
def demosaic(input, width, height): print(f'width: {width}, height: {height}') f0 = hl.Buffer(hl.Int(32), [5, 5], "demosaic_f0") f1 = hl.Buffer(hl.Int(32), [5, 5], "demosaic_f1") f2 = hl.Buffer(hl.Int(32), [5, 5], "demosaic_f2") f3 = hl.Buffer(hl.Int(32), [5, 5], "demosaic_f3") f0.translate([-2, -2]) f1.translate([-2, -2]) f2.translate([-2, -2]) f3.translate([-2, -2]) d0 = hl.Func("demosaic_0") d1 = hl.Func("demosaic_1") d2 = hl.Func("demosaic_2") d3 = hl.Func("demosaic_3") output = hl.Func("demosaic_output") x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c") rdom0 = hl.RDom([(-2, 5), (-2, 5)]) # rdom1 = hl.RDom([(0, width / 2), (0, height / 2)]) input_mirror = hl.BoundaryConditions.mirror_interior(input, [(0, width), (0, height)]) f0.fill(0) f1.fill(0) f2.fill(0) f3.fill(0) f0_sum = 8 f1_sum = 16 f2_sum = 16 f3_sum = 16 f0[0, -2] = -1 f0[0, -1] = 2 f0[-2, 0] = -1 f0[-1, 0] = 2 f0[0, 0] = 4 f0[1, 0] = 2 f0[2, 0] = -1 f0[0, 1] = 2 f0[0, 2] = -1 f1[0, -2] = 1 f1[-1, -1] = -2 f1[1, -1] = -2 f1[-2, 0] = -2 f1[-1, 0] = 8 f1[0, 0] = 10 f1[1, 0] = 8 f1[2, 0] = -2 f1[-1, 1] = -2 f1[1, 1] = -2 f1[0, 2] = 1 f2[0, -2] = -2 f2[-1, -1] = -2 f2[0, -1] = 8 f2[1, -1] = -2 f2[-2, 0] = 1 f2[0, 0] = 10 f2[2, 0] = 1 f2[-1, 1] = -2 f2[0, 1] = 8 f2[1, 1] = -2 f2[0, 2] = -2 f3[0, -2] = -3 f3[-1, -1] = 4 f3[1, -1] = 4 f3[-2, 0] = -3 f3[0, 0] = 12 f3[2, 0] = -3 f3[-1, 1] = 4 f3[1, 1] = 4 f3[0, 2] = -3 d0[x, y] = hl.u16_sat(hl.sum(hl.i32(input_mirror[x + rdom0.x, y + rdom0.y]) * f0[rdom0.x, rdom0.y]) / f0_sum) d1[x, y] = hl.u16_sat(hl.sum(hl.i32(input_mirror[x + rdom0.x, y + rdom0.y]) * f1[rdom0.x, rdom0.y]) / f1_sum) d2[x, y] = hl.u16_sat(hl.sum(hl.i32(input_mirror[x + rdom0.x, y + rdom0.y]) * f2[rdom0.x, rdom0.y]) / f2_sum) d3[x, y] = hl.u16_sat(hl.sum(hl.i32(input_mirror[x + rdom0.x, y + rdom0.y]) * f3[rdom0.x, rdom0.y]) / f3_sum) R_row = y % 2 == 0 B_row = y % 2 != 0 R_col = x % 2 == 0 B_col = x % 2 != 0 at_R = c == 0 at_G = c == 1 at_B = c == 2 output[x, y, c] = hl.select(at_R & R_row & B_col, d1[x, y], at_R & B_row & R_col, d2[x, y], at_R & B_row & B_col, d3[x, y], at_G & R_row & R_col, d0[x, y], at_G & B_row & B_col, d0[x, y], at_B & B_row & R_col, d1[x, y], at_B & R_row & B_col, d2[x, y], at_B & R_row & R_col, d3[x, y], input[x, y]) d0.compute_root().parallel(y).vectorize(x, 16) d1.compute_root().parallel(y).vectorize(x, 16) d2.compute_root().parallel(y).vectorize(x, 16) d3.compute_root().parallel(y).vectorize(x, 16) output.compute_root().parallel(y).align_bounds(x, 2).unroll(x, 2).align_bounds(y, 2).unroll(y, 2).vectorize(x, 16) return output
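
# A plain-Python cross-check of the Bayer-site dispatch in demosaic
# above (added for illustration; it assumes an RGGB mosaic with R at
# even row/even col and B at odd row/odd col, and the helper name is
# ours). It mirrors the big hl.select: which interpolator feeds each
# (row parity, column parity, channel) combination.
def demosaic_site(x, y, c):
    R_row, R_col = y % 2 == 0, x % 2 == 0
    B_row, B_col = not R_row, not R_col
    if c == 0:  # red channel
        if R_row and B_col: return 'd1'
        if B_row and R_col: return 'd2'
        if B_row and B_col: return 'd3'
    elif c == 1:  # green channel
        if (R_row and R_col) or (B_row and B_col): return 'd0'
    elif c == 2:  # blue channel
        if B_row and R_col: return 'd1'
        if R_row and B_col: return 'd2'
        if R_row and R_col: return 'd3'
    return 'input'  # this channel is measured directly at this site

# For example, at an R site the red value passes straight through,
# while blue comes from the diagonal interpolator d3:
assert demosaic_site(0, 0, 0) == 'input'
assert demosaic_site(0, 0, 2) == 'd3'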