Python select示例，halide.select Python示例

示例#1

0

显示文件

def desaturate_noise(input, width, height):
    """Build a Func that pulls channels 1 and 2 toward a twice-blurred copy.

    Channel 0 (and every value that fails the test below) is copied from
    ``input`` unchanged.  For channels 1 and 2 a value is replaced by
    ``0.7 * blur + 0.3 * input`` when the ratio ``|blur| / |input|`` is below
    1.4 and both magnitudes are below 25000.

    Args:
        input: Three-dimensional source indexed as ``input[x, y, c]``.
        width: Extent used for the mirrored boundary in x.
        height: Extent used for the mirrored boundary in y.

    Returns:
        The scheduled ``hl.Func`` named "desaturate_noise_output".
    """
    print('    desaturate_noise')

    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")
    output = hl.Func("desaturate_noise_output")

    # Two successive 15x15 Gaussian passes over the mirrored input.
    mirrored = hl.BoundaryConditions.mirror_image(input, [(0, width), (0, height)])
    first_pass = gauss_15x15(mirrored, "desaturate_noise_blur1")
    blur = gauss_15x15(first_pass, "desaturate_noise_blur_2")

    factor = 1.4
    threshold = 25000

    # Pure definition: pass everything through.
    output[x, y, c] = input[x, y, c]

    # Update definitions: one per processed channel, in channel order.
    for channel in (1, 2):
        smooth = blur[x, y, channel]
        raw = input[x, y, channel]
        smooth_mag = hl.abs(smooth)
        raw_mag = hl.abs(raw)

        should_blend = ((smooth_mag / raw_mag < factor) &
                        (raw_mag < threshold) & (smooth_mag < threshold))
        output[x, y, channel] = hl.select(should_blend,
                                          0.7 * smooth + 0.3 * raw,
                                          raw)

    output.compute_root().parallel(y).vectorize(x, 16)

    return output

示例#2

0

显示文件

文件： autoscheduler_error.py 项目： rheimbuch/halide_autoscheduler_error

def expand_layer(x, y, c, img):
    """Return a Func that upsamples ``img`` by two and smooths the result.

    Samples of ``img`` are placed on sites where both ``x`` and ``y`` are
    even (all other sites are 0.0), the sparse image is passed through
    ``gaussian``, and the outcome is scaled by 4.0.
    """
    on_even_site = (x % 2 == 0) & (y % 2 == 0)

    sparse = hl.Func('expanded')
    sparse[x, y, c] = hl.select(on_even_site, img[x // 2, y // 2, c], 0.0)

    smoothed = gaussian(x, y, c, sparse)

    result = mkfunc("expand", img)
    result[x, y, c] = smoothed[x, y, c] * 4.0
    return result

示例#3

0

显示文件

文件： basics.py 项目： darkbuck/Halide

def test_basics3():
    """Print the expressions produced by select and ``+=`` on a Func reference.

    Sets up the first stages of a bilateral-grid pipeline (clamped input,
    reduction domain, quantised intensity index) and prints the intermediate
    objects.  Nothing is realized and nothing is returned.
    """
    image = hl.ImageParam(hl.Float(32), 3, 'input')
    # A value is needed if not generating an executable
    range_sigma = hl.Param(hl.Float(32), 'r_sigma', 0.1)
    # This is passed during code generation in the C++ version
    spatial_sigma = 8
    half_sigma = spatial_sigma // 2

    x, y, z, c = hl.Var('x'), hl.Var('y'), hl.Var('z'), hl.Var('c')

    # Add a boundary condition
    clamped = hl.Func('clamped')
    safe_x = hl.clamp(x, 0, image.width() - 1)
    safe_y = hl.clamp(y, 0, image.height() - 1)
    clamped[x, y] = image[safe_x, safe_y, 0]

    # Construct the bilateral grid
    r = hl.RDom(0, spatial_sigma, 0, spatial_sigma, 'r')
    sample_x = x * spatial_sigma + r.x - half_sigma
    sample_y = y * spatial_sigma + r.y - half_sigma
    val = hl.clamp(clamped[sample_x, sample_y], 0.0, 1.0)
    zi = hl.cast(hl.Int(32), (val / range_sigma) + 0.5)

    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0

    contribution = hl.select(c == 0, val, 1.0)
    print("hl.select(c == 0, val, 1.0)", contribution)

    cell = histogram[x, y, zi, c]
    print("histogram[x, y, zi, c]", histogram[x, y, zi, c])
    print("histogram[x, y, zi, c]", cell)
    cell += 5
    print("histogram[x, y, zi, c] after += 5", cell)
    cell += contribution

示例#4

0

显示文件

def test_basics3():
    """Check that select and ``+=`` on a Func reference can be constructed.

    Builds a clamped view of the input, a two-dimensional reduction domain
    and a quantised index into a histogram Func, then applies two in-place
    additions to a reference into that histogram.  Returns nothing.
    """
    source = hl.ImageParam(hl.Float(32), 3, 'input')
    # The default value is needed if not generating an executable
    sigma_r = hl.Param(hl.Float(32), 'r_sigma', 0.1)
    # This is passed during code generation in the C++ version
    sigma_s = 8

    x, y, z, c = (hl.Var(name) for name in ('x', 'y', 'z', 'c'))

    # Add a boundary condition
    clamped = hl.Func('clamped')
    last_col = source.width() - 1
    last_row = source.height() - 1
    clamped[x, y] = source[hl.clamp(x, 0, last_col), hl.clamp(y, 0, last_row), 0]

    # Construct the bilateral grid
    r = hl.RDom([(0, sigma_s), (0, sigma_s)], 'r')
    centre_shift = sigma_s // 2
    val = clamped[x * sigma_s + r.x - centre_shift,
                  y * sigma_s + r.y - centre_shift]
    val = hl.clamp(val, 0.0, 1.0)
    zi = hl.i32((val / sigma_r) + 0.5)

    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0

    increment = hl.select(c == 0, val, 1.0)
    target = histogram[x, y, zi, c]
    target += 5
    target += increment

示例#5

0

显示文件

def merge_temporal(images, alignment):
    """Build a Func that averages aligned frames, weighted by tile similarity.

    A per-tile weight is derived for each alternate frame from the L1
    distance between the reference tile and the aligned alternate tile on a
    downsampled layer.  The output is the weighted average of the reference
    frame (weight 1) and the alternate frames.

    Args:
        images: Frame stack indexed as ``images[x, y, frame]``; frame 0 is
            used as the reference.
        alignment: Per-tile offsets indexed as ``alignment[tx, ty, frame]``.

    Returns:
        The scheduled ``hl.Func`` named "merge_temporal_output", indexed as
        ``output[ix, iy, tx, ty]``.
    """
    weight = hl.Func("merge_temporal_weights")
    total_weight = hl.Func("merge_temporal_total_weights")
    output = hl.Func("merge_temporal_output")

    ix, iy, tx, ty, n = (hl.Var(name) for name in ('ix', 'iy', 'tx', 'ty', 'n'))
    in_tile = hl.RDom([(0, 16), (0, 16)])
    alt_frames = hl.RDom([(1, images.dim(2).extent() - 1)])

    mirrored = hl.BoundaryConditions.mirror_interior(
        images, [(0, images.width()), (0, images.height())])
    layer = box_down2(mirrored, "merge_layer")

    # ---- Stage 1: per-tile weights, measured on the downsampled layer ----
    layer_shift = Point(alignment[tx, ty, n]).clamp(
        Point(MINIMUM_OFFSET, MINIMUM_OFFSET),
        Point(MAXIMUM_OFFSET, MAXIMUM_OFFSET))

    shifted_layer_x = idx_layer(tx, in_tile.x) + layer_shift.x / 2
    shifted_layer_y = idx_layer(ty, in_tile.y) + layer_shift.y / 2

    layer_ref = layer[idx_layer(tx, in_tile.x), idx_layer(ty, in_tile.y), 0]
    layer_alt = layer[shifted_layer_x, shifted_layer_y, n]

    factor = 8.0
    min_distance = 10
    max_distance = 300  # max L1 distance, otherwise the value is not used

    int32 = hl.Int(32)
    abs_diff = hl.abs(hl.cast(int32, layer_ref) - hl.cast(int32, layer_alt))
    distance = hl.sum(abs_diff) / 256

    normal_distance = hl.max(
        1, hl.cast(int32, distance) / factor - min_distance / factor)

    # Weight for the alternate frame
    too_far = normal_distance > (max_distance - min_distance)
    weight[tx, ty, n] = hl.select(too_far, 0.0, 1.0 / normal_distance)

    total_weight[tx, ty] = hl.sum(weight[tx, ty, alt_frames]) + 1

    # ---- Stage 2: weighted average at full resolution ----
    full_shift = Point(alignment[tx, ty, alt_frames])

    shifted_x = idx_im(tx, ix) + full_shift.x
    shifted_y = idx_im(ty, iy) + full_shift.y

    full_ref = mirrored[idx_im(tx, ix), idx_im(ty, iy), 0]
    full_alt = mirrored[shifted_x, shifted_y, alt_frames]

    # Sum all values according to their weight, and divide by total weight
    # to obtain the average
    weighted_alternates = hl.sum(
        weight[tx, ty, alt_frames] * full_alt / total_weight[tx, ty])
    output[ix, iy, tx, ty] = weighted_alternates + full_ref / total_weight[tx, ty]

    for stage in (weight, total_weight):
        stage.compute_root().parallel(ty).vectorize(tx, 16)

    output.compute_root().parallel(ty).vectorize(ix, 32)

    return output

示例#6

0

显示文件

def test_minmax():
    """Check multi-argument ``hl.min`` / ``hl.max`` inside a chained select.

    The select picks, per ``x``:
      * ``x == 0``           -> ``min(x, 1)``
      * ``x == 2 or x == 4`` -> ``i32(min(f32(x), 3.2, x * 2.1))``
      * ``x == 3``           -> ``max(x, x * 3, 1, x * 4)``
      * otherwise            -> ``x``
    """
    x = hl.Var()
    f = hl.Func()
    # The cast helpers are reached through the ``hl`` namespace like every
    # other Halide name here; the bare ``i32`` / ``f32`` spellings are only
    # defined if the module happens to import them separately.
    f[x] = hl.select(
        x == 0, hl.min(x, 1),
        (x == 2) | (x == 4), hl.i32(hl.min(hl.f32(x), 3.2, x * 2.1)),
        x == 3, hl.max(x, x * 3, 1, x * 4),
        x)
    b = f.realize(5)
    assert b[0] == 0, b[0]
    assert b[1] == 1, b[1]
    assert b[2] == 2, b[2]
    assert b[3] == 12, b[3]
    assert b[4] == 3, b[4]

示例#7

0

显示文件

def shift_bayer_to_rggb(input, cfa_pattern):
    """Return a Func that reads ``input`` shifted according to ``cfa_pattern``.

    Pattern codes 1, 2, 4 and 3 select shifts of (0, 0), (+1, 0), (0, +1)
    and (+1, +1) respectively; any other code yields 0 everywhere.
    """
    print(f'cfa_pattern: {cfa_pattern}')

    x, y = hl.Var("x"), hl.Var("y")
    output = hl.Func("rggb_input")
    cfa = hl.u16(cfa_pattern)

    # (pattern code, site to read) in the order the conditions are tested.
    shifted_sites = (
        (1, (x, y)),
        (2, (x + 1, y)),
        (4, (x, y + 1)),
        (3, (x + 1, y + 1)),
    )

    select_args = []
    for code, site in shifted_sites:
        select_args.append(cfa == hl.u16(code))
        select_args.append(input[site])

    output[x, y] = hl.select(*select_args, 0)
    return output

示例#8

0

显示文件

文件： multipass_constraints.py 项目： adityaatluri/Halide

def test_multipass_constraints():
    """Check bounds inference when input and output constraints interact.

    The input's first dimension is constrained in terms of its second
    dimension and the output extent, and the output extent is constrained in
    terms of itself, so inference needs several passes to settle.
    """
    image = hl.ImageParam(hl.Float(32), 2, "input")

    f = hl.Func("f")
    x = hl.Var("x")
    y = hl.Var("y")

    f[x, y] = image[x + 1, y + 1] + image[x - 1, y - 1]
    f[x, y] += 3.0
    f.update().vectorize(x, 4)

    o = f.output_buffer()

    # Now make some hard-to-resolve constraints
    image.dim(0).set_bounds(
        min=image.dim(1).min() - 5,
        extent=image.dim(1).extent() + o.dim(0).extent(),
    )

    padded_extent = hl.select(o.dim(0).extent() < 22,
                              o.dim(0).extent() + 1,
                              o.dim(0).extent())
    o.dim(0).set_bounds(min=0, extent=padded_extent)

    # Make a bounds query buffer
    query_buf = hl.Buffer.make_bounds_query(type=hl.Float(32), sizes=[7, 8])
    query_buf.set_min([2, 2])

    f.infer_input_bounds(query_buf)

    inferred_in = image.get()
    in_bounds = (
        inferred_in.dim(0).min(), inferred_in.dim(0).extent(),
        inferred_in.dim(1).min(), inferred_in.dim(1).extent(),
    )
    out_bounds = (
        query_buf.dim(0).min(), query_buf.dim(0).extent(),
        query_buf.dim(1).min(), query_buf.dim(1).extent(),
    )

    if in_bounds != (-4, 34, 1, 10) or out_bounds != (0, 24, 2, 8):
        print("Constraints not correctly satisfied:\n",
              "in:", *in_bounds,
              "out:", *out_bounds)
        assert False

示例#9

0

显示文件

def gamma_inverse(input):
    """Return a Func applying a two-segment inverse tone curve to ``input``.

    Values below ``cutoff`` are scaled by ``gamma_toe``; all others go
    through ``(v / 65535 + gamma_con) ** gamma_pow * gamma_fac``.  The result
    is cast to 16-bit unsigned.  Two-dimensional inputs are indexed as
    ``[x, y]``, anything else as ``[x, y, c]``.
    """
    output = hl.Func("gamma_inverse_output")

    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")

    cutoff = 2575
    gamma_toe = 0.0774
    gamma_pow = 2.4
    gamma_fac = 57632.49226
    gamma_con = 0.055

    # One index tuple serves both the read and the write.
    site = (x, y) if input.dimensions() == 2 else (x, y, c)

    toe_segment = gamma_toe * input[site]
    power_segment = hl.pow(hl.f32(input[site]) / 65535 + gamma_con, gamma_pow) * gamma_fac
    output[site] = hl.u16(hl.select(input[site] < cutoff, toe_segment, power_segment))

    output.compute_root().parallel(y).vectorize(x, 16)

    return output

示例#10

0

显示文件

def gamma_correct(input):
    """Return a Func applying a two-segment tone curve to ``input``.

    Values below ``cutoff`` are scaled by ``gamma_toe``; all others become
    ``gamma_fac * v ** gamma_pow + gamma_con``.  The result is cast to 16-bit
    unsigned.  Two-dimensional inputs are indexed as ``[x, y]``, anything
    else as ``[x, y, c]``.
    """
    output = hl.Func("gamma_correct_output")

    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")

    cutoff = 200
    gamma_toe = 12.92
    gamma_pow = 0.416667
    gamma_fac = 680.552897
    gamma_con = -3604.425

    def encode(value):
        # Linear toe below the cutoff, power law above it.
        return hl.u16(hl.select(value < cutoff,
                                gamma_toe * value,
                                gamma_fac * hl.pow(value, gamma_pow) + gamma_con))

    if input.dimensions() == 2:
        output[x, y] = encode(input[x, y])
    else:
        output[x, y, c] = encode(input[x, y, c])

    output.compute_root().parallel(y).vectorize(x, 16)

    return output

示例#11

0

显示文件

def test_multipass_constraints():
    """Verify inferred bounds under mutually dependent buffer constraints.

    After constraining the input's first dimension from its second dimension
    plus the output extent, and the output extent from itself, a bounds query
    must yield the expected min/extent pairs for both buffers.
    """
    src = hl.ImageParam(hl.Float(32), 2, "input")

    f = hl.Func("f")
    x = hl.Var("x")
    y = hl.Var("y")

    f[x, y] = src[x + 1, y + 1] + src[x - 1, y - 1]
    f[x, y] += 3.0
    f.update().vectorize(x, 4)

    o = f.output_buffer()

    # Now make some hard-to-resolve constraints
    src.dim(0).set_bounds(min=src.dim(1).min() - 5,
                          extent=src.dim(1).extent() + o.dim(0).extent())

    grow_when_small = hl.select(o.dim(0).extent() < 22,
                                o.dim(0).extent() + 1,
                                o.dim(0).extent())
    o.dim(0).set_bounds(min=0, extent=grow_when_small)

    # Make a bounds query buffer
    query_buf = hl.Buffer.make_bounds_query(type=hl.Float(32), sizes=[7, 8])
    query_buf.set_min([2, 2])

    f.infer_input_bounds(query_buf)

    def bounds_of(buf):
        # [min0, extent0, min1, extent1] for a two-dimensional buffer.
        found = []
        for axis in (0, 1):
            found.append(buf.dim(axis).min())
            found.append(buf.dim(axis).extent())
        return found

    got_in = bounds_of(src.get())
    got_out = bounds_of(query_buf)
    want = [-4, 34, 1, 10, 0, 24, 2, 8]

    if any(got != exp for got, exp in zip(got_in + got_out, want)):
        print("Constraints not correctly satisfied:\n", "in:", *got_in,
              "out:", *got_out)
        assert False

示例#12

0

显示文件

def test_select():
    """Check that a chained select takes the first matching condition."""
    x = hl.Var()
    f = hl.Func()

    # (condition, value) pairs in priority order.
    cases = [
        (x == 0, 31),
        (x == 2, (x * 24)),
        (x == 2, 999),  # should be ignored: first condition wins
    ]
    flat_args = [item for case in cases for item in case]
    f[x] = hl.select(*flat_args, x)

    b = f.realize(4)
    for position, wanted in enumerate((31, 1, 48, 3)):
        assert b[position] == wanted

示例#13

0

显示文件

def main():
    """Demonstrate tuple-valued Halide Funcs and check them against Python.

    Covers multi-valued Funcs, realizing into several buffers, tuple
    reductions (a hand-written argmax) and a user-defined ``Complex`` type
    used to render a Mandelbrot set as ASCII art.

    Returns:
        0 once every check has passed and "Success!" has been printed.
    """

    # So far Funcs (such as the one below) have evaluated to a single
    # scalar value for each point in their domain.
    single_valued = hl.Func()
    x, y = hl.Var("x"), hl.Var("y")
    single_valued[x, y] = x + y

    # One way to write a hl.Func that returns a collection of values is
    # to add an additional dimension which indexes that
    # collection. This is how we typically deal with color. For
    # example, the hl.Func below represents a collection of three values
    # for every x, y coordinate indexed by c.
    color_image = hl.Func()
    c = hl.Var("c")
    color_image[x, y, c] = hl.select(
        c == 0,
        245,  # Red value
        c == 1,
        42,  # Green value
        132)  # Blue value

    # Since this pattern appears quite often, Halide provides a
    # syntactic sugar to write the code above as the following,
    # using the "mux" function.
    # color_image[x, y, c] = hl.mux(c, [245, 42, 132]);

    # This method is often convenient because it makes it easy to
    # operate on this hl.Func in a way that treats each item in the
    # collection equally:
    brighter = hl.Func()
    brighter[x, y, c] = color_image[x, y, c] + 10

    # However this method is also inconvenient for three reasons.
    #
    # 1) Funcs are defined over an infinite domain, so users of this
    # hl.Func can for example access color_image(x, y, -17), which is
    # not a meaningful value and is probably indicative of a bug.
    #
    # 2) It requires a hl.select, which can impact performance if not
    # bounded and unrolled:
    # brighter.bound(c, 0, 3).unroll(c)
    #
    # 3) With this method, all values in the collection must have the
    # same type. While the above two issues are merely inconvenient,
    # this one is a hard limitation that makes it impossible to
    # express certain things in this way.

    # It is also possible to represent a collection of values as a
    # collection of Funcs:
    func_array = [hl.Func() for i in range(3)]
    func_array[0][x, y] = x + y
    func_array[1][x, y] = hl.sin(x)
    func_array[2][x, y] = hl.cos(y)

    # This method avoids the three problems above, but introduces a
    # new annoyance. Because these are separate Funcs, it is
    # difficult to schedule them so that they are all computed
    # together inside a single loop over x, y.

    # A third alternative is to define a hl.Func as evaluating to a
    # Tuple instead of an hl.Expr. A Tuple is a fixed-size collection of
    # Exprs which may have different type. The following function
    # evaluates to an integer value (x+y), and a floating point value
    # (hl.sin(x*y)).
    multi_valued = hl.Func("multi_valued")
    multi_valued[x, y] = (x + y, hl.sin(x * y))

    # Realizing a tuple-valued hl.Func returns a collection of
    # Buffers. We call this a Realization. It's equivalent to a
    # std::vector of hl.Buffer/Image objects:
    if True:
        im1, im2 = multi_valued.realize([80, 60])
        assert im1.type() == hl.Int(32)
        assert im2.type() == hl.Float(32)
        assert im1[30, 40] == 30 + 40
        assert np.isclose(im2[30, 40], math.sin(30 * 40))

    # You can also pass a tuple of pre-allocated buffers to realize()
    # rather than having new ones created. (The Buffers must have the correct
    # types and have identical sizes.)
    if True:
        im1, im2 = hl.Buffer(hl.Int(32),
                             [80, 60]), hl.Buffer(hl.Float(32), [80, 60])
        multi_valued.realize((im1, im2))
        assert im1[30, 40] == 30 + 40
        assert np.isclose(im2[30, 40], math.sin(30 * 40))

    # All Tuple elements are evaluated together over the same domain
    # in the same loop nest, but stored in distinct allocations. The
    # equivalent plain loop nest for the 80-wide, 60-tall realization
    # above is:
    if True:
        multi_valued_0 = np.empty((80 * 60), dtype=np.int32)
        # The second tuple element is floating point; an integer array
        # would silently truncate every sine value.
        multi_valued_1 = np.empty((80 * 60), dtype=np.float32)

        # x is the innermost dimension, so a row is 80 elements long.
        for yy in range(60):
            for xx in range(80):
                multi_valued_0[xx + 80 * yy] = xx + yy
                multi_valued_1[xx + 80 * yy] = math.sin(xx * yy)

    # When compiling ahead-of-time, a Tuple-valued hl.Func evaluates
    # into multiple distinct output halide_buffer_t structs. These appear in
    # order at the end of the function signature:
    # int multi_valued(...input buffers and params..., halide_buffer_t
    # *output_1, halide_buffer_t *output_2)

    # In this Python interface a Tuple is written as an ordinary
    # Python tuple of Exprs, exactly as we did above:
    multi_valued_2 = hl.Func("multi_valued_2")
    multi_valued_2[x, y] = (x + y, hl.sin(x * y))

    # Calls to a multi-valued hl.Func cannot be treated as Exprs. The
    # following is not valid:
    # hl.Func consumer
    # consumer[x, y] = multi_valued_2[x, y] + 10

    # Instead you must index the returned object with square brackets
    # to retrieve the individual Exprs:
    integer_part = multi_valued_2[x, y][0]
    floating_part = multi_valued_2[x, y][1]
    assert type(integer_part) is hl.FuncTupleElementRef
    assert type(floating_part) is hl.FuncTupleElementRef

    consumer = hl.Func()
    consumer[x, y] = (integer_part + 10, floating_part + 10.0)

    # Tuple reductions.
    if True:
        # Tuples are particularly useful in reductions, as they allow
        # the reduction to maintain complex state as it walks along
        # its domain. The simplest example is an argmax.

        # First we create an Image to take the argmax over.
        input_func = hl.Func()
        input_func[x] = hl.sin(x)
        input = input_func.realize([100])
        assert input.type() == hl.Float(32)

        # Then we define a 2-valued Tuple which tracks the index of the
        # maximum value and the value itself.
        arg_max = hl.Func()

        # Pure definition.
        # (using [()] for zero-dimensional Funcs is a convention of this python interface)
        arg_max[()] = (0, input[0])

        # Update definition. The index moves only when the new sample
        # strictly exceeds the running maximum, so ties keep the first
        # index found.
        r = hl.RDom([(1, 99)])
        old_index = arg_max[()][0]
        old_max = arg_max[()][1]
        new_index = hl.select(old_max < input[r], r, old_index)
        new_max = hl.max(input[r], old_max)
        arg_max[()] = (new_index, new_max)

        # The equivalent plain-Python loop is:
        arg_max_0 = 0
        arg_max_1 = float(input[0])
        for rr in range(1, 100):
            old_index = arg_max_0
            old_max = arg_max_1
            new_index = rr if (old_max < input[rr]) else old_index
            new_max = max(input[rr], old_max)
            # In a tuple update definition, all loads and computation
            # are done before any stores, so that all Tuple elements
            # are updated atomically with respect to recursive calls
            # to the same hl.Func.
            arg_max_0 = new_index
            arg_max_1 = new_max

        # Let's verify that Halide and the plain loop found the same
        # maximum value and index.
        if True:
            r0, r1 = arg_max.realize()

            assert r0.type() == hl.Int(32)
            assert r1.type() == hl.Float(32)
            assert arg_max_0 == r0[()]
            assert np.isclose(arg_max_1, r1[()])

        # Halide provides argmax and hl.argmin as built-in reductions
        # similar to sum, product, maximum, and minimum. They return
        # a Tuple consisting of the point in the reduction domain
        # corresponding to that value, and the value itself. In the
        # case of ties they return the first value found. We'll use
        # one of these in the following section.

    # Tuples for user-defined types.
    if True:
        # Tuples can also be a convenient way to represent compound
        # objects such as complex numbers. Defining an object that
        # can be converted to and from a Tuple is one way to extend
        # Halide's type system with user-defined types.
        class Complex:
            def __init__(self, r, i=None):
                # Accepts two Python floats, two Exprs, or a single
                # two-element indexable (e.g. a tuple-valued Func call
                # or another Complex).
                if type(r) is float and type(i) is float:
                    self.real = hl.Expr(r)
                    self.imag = hl.Expr(i)
                elif i is not None:
                    self.real = r
                    self.imag = i
                else:
                    self.real = r[0]
                    self.imag = r[1]

            def as_tuple(self):
                "Convert to a Tuple"
                return (self.real, self.imag)

            def __add__(self, other):
                "Complex addition"
                return Complex(self.real + other.real, self.imag + other.imag)

            def __mul__(self, other):
                "Complex multiplication"
                return Complex(self.real * other.real - self.imag * other.imag,
                               self.real * other.imag + self.imag * other.real)

            def __getitem__(self, idx):
                return (self.real, self.imag)[idx]

            def __len__(self):
                return 2

            def magnitude(self):
                "Squared complex magnitude (no square root is taken)"
                return (self.real * self.real) + (self.imag * self.imag)

            # Other complex operators would go here. The above are
            # sufficient for this example.

        # Let's use the Complex class to compute a Mandelbrot set.
        mandelbrot = hl.Func()

        # The initial complex value corresponding to an x, y coordinate
        # in our hl.Func.
        initial = Complex(x / 15.0 - 2.5, y / 6.0 - 2.0)

        # Pure definition.
        t = hl.Var("t")
        mandelbrot[x, y, t] = Complex(0.0, 0.0)

        # We'll use an update definition to take 12 steps.
        r = hl.RDom([(1, 12)])
        current = Complex(mandelbrot[x, y, r - 1])

        # The following line uses the complex multiplication and
        # addition we defined above.
        mandelbrot[x, y, r] = (Complex(current * current) + initial)

        # We'll use another tuple reduction to compute the iteration
        # number where the value first escapes a circle of radius 4
        # (squared magnitude 16). This can be expressed as an hl.argmin
        # of a boolean - we want the index of the first time the given
        # boolean expression is false (we consider false to be less
        # than true).  The argmax would return the index of the first
        # time the expression is true.

        escape_condition = Complex(mandelbrot[x, y, r]).magnitude() < 16.0
        first_escape = hl.argmin(escape_condition)
        assert type(first_escape) is tuple
        # We only want the index, not the value, but hl.argmin returns
        # both, so we'll index the hl.argmin Tuple expression using
        # square brackets to get the hl.Expr representing the index.
        escape = hl.Func()
        escape[x, y] = first_escape[0]

        # Realize the pipeline and print the result as ascii art.
        result = escape.realize([61, 25])
        assert result.type() == hl.Int(32)
        code = " .:-~*={&%#@"
        for yy in range(result.height()):
            for xx in range(result.width()):
                index = result[xx, yy]
                if index < len(code):
                    print("%c" % code[index], end="")
                else:
                    pass  # is lesson 13 cpp version buggy ?
            print("")

    print("Success!")

    return 0

示例#14

0

显示文件

def demosaic(input, width, height):
    """Build a Func that fills in all three colour channels of a mosaic.

    Four 5x5 integer kernels (centred on the origin) are convolved with the
    mirrored input, each sum is divided by the kernel's total and saturated
    to 16-bit unsigned.  The output picks one of those four interpolations
    according to channel and row/column parity, and otherwise passes the
    input sample through.

    Args:
        input: Two-dimensional mosaic indexed as ``input[x, y]``.
        width: Extent used for the mirrored boundary in x.
        height: Extent used for the mirrored boundary in y.

    Returns:
        The scheduled ``hl.Func`` named "demosaic_output", indexed as
        ``output[x, y, c]``.
    """
    print(f'width: {width}, height: {height}')

    # One entry per kernel: (sum of weights, {(dx, dy): weight}).
    # Taps not listed stay at zero.
    kernel_specs = (
        (8, {
            (0, -2): -1,
            (0, -1): 2,
            (-2, 0): -1, (-1, 0): 2, (0, 0): 4, (1, 0): 2, (2, 0): -1,
            (0, 1): 2,
            (0, 2): -1,
        }),
        (16, {
            (0, -2): 1,
            (-1, -1): -2, (1, -1): -2,
            (-2, 0): -2, (-1, 0): 8, (0, 0): 10, (1, 0): 8, (2, 0): -2,
            (-1, 1): -2, (1, 1): -2,
            (0, 2): 1,
        }),
        (16, {
            (0, -2): -2,
            (-1, -1): -2, (0, -1): 8, (1, -1): -2,
            (-2, 0): 1, (0, 0): 10, (2, 0): 1,
            (-1, 1): -2, (0, 1): 8, (1, 1): -2,
            (0, 2): -2,
        }),
        (16, {
            (0, -2): -3,
            (-1, -1): 4, (1, -1): 4,
            (-2, 0): -3, (0, 0): 12, (2, 0): -3,
            (-1, 1): 4, (1, 1): 4,
            (0, 2): -3,
        }),
    )
    kernel_count = len(kernel_specs)

    kernels = [hl.Buffer(hl.Int(32), [5, 5], f"demosaic_f{index}")
               for index in range(kernel_count)]
    for kernel in kernels:
        # Shift the origin so the kernel spans [-2, 2] in both dimensions.
        kernel.translate([-2, -2])

    interpolations = [hl.Func(f"demosaic_{index}")
                      for index in range(kernel_count)]

    output = hl.Func("demosaic_output")

    x, y, c = hl.Var("x"), hl.Var("y"), hl.Var("c")
    rdom0 = hl.RDom([(-2, 5), (-2, 5)])

    input_mirror = hl.BoundaryConditions.mirror_interior(input, [(0, width), (0, height)])

    for kernel, (_, taps) in zip(kernels, kernel_specs):
        kernel.fill(0)
        for (dx, dy), tap_weight in taps.items():
            kernel[dx, dy] = tap_weight

    for interpolation, kernel, (weight_sum, _) in zip(interpolations, kernels, kernel_specs):
        weighted = hl.i32(input_mirror[x + rdom0.x, y + rdom0.y]) * kernel[rdom0.x, rdom0.y]
        interpolation[x, y] = hl.u16_sat(hl.sum(weighted) / weight_sum)

    d0, d1, d2, d3 = interpolations

    R_row = y % 2 == 0
    B_row = y % 2 != 0
    R_col = x % 2 == 0
    B_col = x % 2 != 0
    at_R = c == 0
    at_G = c == 1
    at_B = c == 2

    # (condition, interpolation) pairs, tested in this order.
    rules = (
        (at_R & R_row & B_col, d1),
        (at_R & B_row & R_col, d2),
        (at_R & B_row & B_col, d3),
        (at_G & R_row & R_col, d0),
        (at_G & B_row & B_col, d0),
        (at_B & B_row & R_col, d1),
        (at_B & R_row & B_col, d2),
        (at_B & R_row & R_col, d3),
    )
    select_args = []
    for condition, interpolation in rules:
        select_args.append(condition)
        select_args.append(interpolation[x, y])

    # Sites matching no rule already hold the right sample.
    output[x, y, c] = hl.select(*select_args, input[x, y])

    for interpolation in interpolations:
        interpolation.compute_root().parallel(y).vectorize(x, 16)

    output.compute_root().parallel(y).align_bounds(x, 2).unroll(x, 2).align_bounds(y, 2).unroll(y, 2).vectorize(x, 16)

    return output

示例#15

0

显示文件

    def gen_g(self):
        """Define the four-index Func ``g`` and the helper Funcs it depends on.

        Every Func created here is registered through
        ``self.add_funcs_by_name``.  When both ``self.tracing`` and
        ``self.tracing_g`` are set, an extra ``g_trace_in`` image parameter is
        created, stored in ``self.inputs`` / ``self.clamps`` and added to
        ``g``.  Nothing is returned.

        NOTE(review): the formulas look like a two-electron Gaussian integral
        evaluation with a tabulated auxiliary function -- confirm against the
        reference implementation before relying on that reading.
        """
        # vars
        i, j, k, l = [self.vars[c] for c in "ijkl"]
        # clamped inputs
        x, y, z, expnt, fm, rnorm = [
            self.clamps[c] for c in ["x", "y", "z", "expnt", "fm", "rnorm"]
        ]
        # unclamped input (for sizing)
        fm_in = self.inputs["fm_in"]
        # scalar inputs
        delo2, delta, rdelta = [
            self.inputs[c] for c in ["delo2", "delta", "rdelta"]
        ]

        # --- pairwise geometry and exponent terms over (i, j) ---
        dx = hl.Func("dx")
        dy = hl.Func("dy")
        dz = hl.Func("dz")
        r2 = hl.Func("g_r2")
        expnt2 = hl.Func("expnt2")
        expnt_inv = hl.Func("expnt_inv")
        self.add_funcs_by_name([dx, dy, dz, r2, expnt2, expnt_inv])

        # Coordinate differences between entries i and j.
        dx[i, j] = x[i] - x[j]
        dy[i, j] = y[i] - y[j]
        dz[i, j] = z[i] - z[j]

        # Squared distance between entries i and j.
        r2[i,
           j] = dx[i, j] * dx[i, j] + dy[i, j] * dy[i, j] + dz[i, j] * dz[i, j]

        # Sum of the two exponents and its reciprocal.
        expnt2[i, j] = expnt[i] + expnt[j]
        expnt_inv[i, j] = hl.f64(1.0) / expnt2[i, j]

        # --- exponential prefactor and denominators over (i, j, k, l) ---
        fac2 = hl.Func("fac2")
        ex_arg = hl.Func("ex_arg")
        ex = hl.Func("ex")
        denom = hl.Func("denom")
        fac4d = hl.Func("fac4d")
        self.add_funcs_by_name([fac2, ex_arg, ex, denom, fac4d])
        fac2[i, j] = expnt[i] * expnt[j] * expnt_inv[i, j]
        ex_arg[i, j, k, l] = -fac2[i, j] * r2[i, j] - fac2[k, l] * r2[k, l]
        # The exponential is replaced by exactly zero once its argument
        # drops below -37.
        ex[i, j, k, l] = hl.select(ex_arg[i, j, k, l] < hl.f64(-37.0),
                                   hl.f64(0.0), hl.exp(ex_arg[i, j, k, l]))
        denom[i, j, k,
              l] = expnt2[i, j] * expnt2[k, l] * hl.sqrt(expnt2[i, j] +
                                                         expnt2[k, l])
        fac4d[i, j, k,
              l] = expnt2[i, j] * expnt2[k, l] / (expnt2[i, j] + expnt2[k, l])

        # --- exponent-weighted mean positions and their squared separation ---
        x2 = hl.Func("g_x2")
        y2 = hl.Func("g_y2")
        z2 = hl.Func("g_z2")
        rpq2 = hl.Func("rpq2")
        self.add_funcs_by_name([x2, y2, z2, rpq2])
        x2[i, j] = (x[i] * expnt[i] + x[j] * expnt[j]) * expnt_inv[i, j]
        y2[i, j] = (y[i] * expnt[i] + y[j] * expnt[j]) * expnt_inv[i, j]
        z2[i, j] = (z[i] * expnt[i] + z[j] * expnt[j]) * expnt_inv[i, j]
        rpq2[i, j, k, l] = ((x2[i, j] - x2[k, l]) * (x2[i, j] - x2[k, l]) +
                            (y2[i, j] - y2[k, l]) * (y2[i, j] - y2[k, l]) +
                            (z2[i, j] - z2[k, l]) * (z2[i, j] - z2[k, l]))

        # --- table lookup with a polynomial correction ---
        f0t = hl.Func("f0t")
        f0n = hl.Func("f0n")
        f0x = hl.Func("f0x")
        f0val = hl.Func("f0val")
        self.add_funcs_by_name([f0t, f0n, f0x, f0val])
        f0t[i, j, k, l] = fac4d[i, j, k, l] * rpq2[i, j, k, l]
        # Row index into the fm table, clamped to the bounds of dimension 0
        # of the unclamped table input.
        f0n[i, j, k, l] = hl.clamp(hl.i32((f0t[i, j, k, l] + delo2) * rdelta),
                                   fm_in.dim(0).min(),
                                   fm_in.dim(0).max())
        # Offset of f0t from the selected table row's grid point.
        f0x[i, j, k, l] = delta * f0n[i, j, k, l] - f0t[i, j, k, l]
        # For f0t >= 28 a closed form in 1/sqrt(f0t) is used; otherwise a
        # nested polynomial in f0x built from columns 0..4 of the table row.
        f0val[i, j, k, l] = hl.select(
            f0t[i, j, k, l] >= hl.f64(28.0),
            hl.f64(0.88622692545276) / hl.sqrt(f0t[i, j, k, l]),
            fm[f0n[i, j, k, l], 0] + f0x[i, j, k, l] *
            (fm[f0n[i, j, k, l], 1] + f0x[i, j, k, l] * hl.f64(0.5) *
             (fm[f0n[i, j, k, l], 2] + f0x[i, j, k, l] * hl.f64(1. / 3.) *
              (fm[f0n[i, j, k, l], 3] +
               f0x[i, j, k, l] * hl.f64(0.25) * fm[f0n[i, j, k, l], 4]))))

        # --- final product ---
        g = hl.Func("g")
        self.add_funcs_by_name([g])

        if self.tracing and self.tracing_g:
            # Tracing mode: add a caller-supplied 4-D array (zero outside its
            # bounds) on top of the computed value.
            g_trace_in = hl.ImageParam(hl.Float(64), 4, "g_trace_in")
            g_trace = hl.BoundaryConditions.constant_exterior(g_trace_in, 0)
            self.inputs["g_trace_in"] = g_trace_in
            self.clamps["g_trace"] = g_trace
            g_trace.compute_root()
            g[i, j, k,
              l] = (hl.f64(2.00) * hl.f64(pow(pi, 2.50)) / denom[i, j, k, l]
                    ) * ex[i, j, k, l] * f0val[i, j, k, l] * rnorm[i] * rnorm[
                        j] * rnorm[k] * rnorm[l] + g_trace[i, j, k, l]
        else:
            g_trace = None
            g[i, j, k,
              l] = (hl.f64(2.00) * hl.f64(pow(pi, 2.50)) /
                    denom[i, j, k, l]) * ex[i, j, k, l] * f0val[
                        i, j, k, l] * rnorm[i] * rnorm[j] * rnorm[k] * rnorm[l]

示例#16

0

显示文件

def findStereoCorrespondence(left,
                             right,
                             SADWindowSize,
                             minDisparity,
                             numDisparities,
                             xmin,
                             xmax,
                             ymin,
                             ymax,
                             x_tile_size=32,
                             y_tile_size=32,
                             test=False,
                             uniquenessRatio=0.15,
                             disp12MaxDiff=1):
    """Build a tiled SAD block-matching disparity pipeline.

    For each candidate disparity ``d`` the absolute difference between
    ``left[x, y]`` and ``right[x - d, y]`` is summed over a
    ``SADWindowSize`` x ``SADWindowSize`` window; the disparity with the
    smallest sum is kept for every pixel.

    Args:
        left: image-like object indexed as ``left[x, y]``.
        right: image-like object indexed as ``right[x, y]``.
        SADWindowSize: side length of the matching window (expected to be
            a Python int, since it is halved with floor division).
        minDisparity: first disparity that is searched.
        numDisparities: number of consecutive disparities searched.
        xmin: x offset added to tile coordinates when sampling ``diff``.
        xmax: pixels with ``x >= xmax`` get the FILTERED marker instead of
            a disparity.
        ymin: y offset added to tile coordinates when sampling ``diff``.
        ymax: accepted for interface compatibility; not used here.
        x_tile_size: tile width used by the computation and the schedule.
        y_tile_size: tile height used by the computation and the schedule.
        test: when true, use direct windowed sums instead of running sums.
        uniquenessRatio: accepted for interface compatibility; not used.
        disp12MaxDiff: accepted for interface compatibility; not used.

    Returns:
        The scheduled ``disp`` Func, whose values are cast to UInt(16).
    """

    x, y, d = Var("x"), Var("y"), Var("d")

    # Absolute left/right difference for every candidate disparity d.
    diff = Func("diff")
    diff[d, x, y] = h.cast(UInt(16), h.abs(left[x, y] - right[x - d, y]))

    # Floor division: under Python 3 a plain "/" would yield a float
    # half-window, which is then used as an RDom bound and index offset.
    win2 = SADWindowSize // 2

    # Same data as diff, re-indexed by (inner, outer) tile coordinates.
    diff_T = Func("diff_T")
    xi, xo, yi, yo = Var("xi"), Var("xo"), Var("yi"), Var("yo")
    diff_T[d, xi, yi, xo, yo] = diff[d, xi + xo * x_tile_size + xmin,
                                     yi + yo * y_tile_size + ymin]

    cSAD, vsum = Func("cSAD"), Func("vsum")
    rk = RDom(-win2, SADWindowSize, "rk")
    rxi, ryi = RDom(1, x_tile_size - 1, "rxi"), RDom(1, y_tile_size - 1, "ryi")

    if test:
        # Reference path: recompute the full window sum at every position.
        vsum[d, xi, yi, xo, yo] = h.sum(diff_T[d, xi, yi + rk, xo, yo])
        cSAD[d, xi, yi, xo, yo] = h.sum(vsum[d, xi + rk, yi, xo, yo])
    else:
        # Running sums: seed row/column 0 with a full window sum, then
        # slide the window by adding the entering sample and subtracting
        # the leaving one.
        vsum[d, xi, yi, xo, yo] = h.select(yi != 0, h.cast(UInt(16), 0),
                                           h.sum(diff_T[d, xi, rk, xo, yo]))
        vsum[d, xi, ryi, xo, yo] = (vsum[d, xi, ryi - 1, xo, yo] +
                                    diff_T[d, xi, ryi + win2, xo, yo] -
                                    diff_T[d, xi, ryi - win2 - 1, xo, yo])

        cSAD[d, xi, yi, xo, yo] = h.select(xi != 0, h.cast(UInt(16), 0),
                                           h.sum(vsum[d, rk, yi, xo, yo]))
        cSAD[d, rxi, yi, xo, yo] = (cSAD[d, rxi - 1, yi, xo, yo] +
                                    vsum[d, rxi + win2, yi, xo, yo] -
                                    vsum[d, rxi - win2 - 1, yi, xo, yo])

    # Tuple reduction over the disparity range: (best disparity, best cost).
    rd = RDom(minDisparity, numDisparities)
    disp_left = Func("disp_left")
    # Start from the largest UInt(16) cost. The original spelled this
    # (2 << 16) - 1, which only reached 65535 through the cast wrapping.
    disp_left[xi, yi, xo, yo] = h.Tuple(h.cast(UInt(16), minDisparity),
                                        h.cast(UInt(16), (1 << 16) - 1))
    disp_left[xi, yi, xo, yo] = h.tuple_select(
        cSAD[rd, xi, yi, xo, yo] < disp_left[xi, yi, xo, yo][1],
        h.Tuple(h.cast(UInt(16), rd), cSAD[rd, xi, yi, xo, yo]),
        h.Tuple(disp_left[xi, yi, xo, yo]))

    # Marker written for pixels outside the valid x range (cast to UInt(16)).
    FILTERED = -16
    disp = Func("disp")

    disp[x, y] = h.select(
        # x > xmax-xmin or y > ymax-ymin,
        x < xmax,
        h.cast(
            UInt(16), disp_left[x % x_tile_size, y % y_tile_size,
                                x / x_tile_size, y / y_tile_size][0]),
        h.cast(UInt(16), FILTERED))

    # Schedule
    vector_width = 8
    disp.compute_root() \
        .tile(x, y, xo, yo, xi, yi, x_tile_size, y_tile_size).reorder(xi, yi, xo, yo) \
        .vectorize(xi, vector_width).parallel(xo).parallel(yo)

    # reorder storage
    disp_left.reorder_storage(xi, yi, xo, yo)
    diff_T.reorder_storage(xi, yi, xo, yo, d)
    vsum.reorder_storage(xi, yi, xo, yo, d)
    cSAD.reorder_storage(xi, yi, xo, yo, d)

    disp_left.compute_at(disp, xo).reorder(xi, yi, xo, yo) \
                                  .vectorize(xi, vector_width) \
                                  .update() \
                                  .reorder(xi, yi, rd, xo, yo).vectorize(xi, vector_width)

    if test:
        cSAD.compute_at(disp_left, rd).reorder(xi, yi, xo, yo,
                                               d).vectorize(xi, vector_width)
        vsum.compute_at(disp_left, rd).reorder(xi, yi, xo, yo,
                                               d).vectorize(xi, vector_width)
    else:
        cSAD.compute_at(disp_left, rd).reorder(xi,  yi, xo, yo, d).vectorize(xi, vector_width) \
                                                                  .update() \
                                                                  .reorder(yi, rxi, xo, yo, d).vectorize(yi, vector_width)
        vsum.compute_at(disp_left, rd).reorder(xi,  yi, xo, yo, d).vectorize(xi, vector_width) \
                                                                  .update() \
                                                                  .reorder(xi, ryi, xo, yo, d).vectorize(xi, vector_width)

    return disp

示例#17

0

显示文件

文件： lesson_13_tuples.py 项目： darkbuck/Halide

def main():
    """Walk through tuple-valued Halide Funcs (tutorial lesson 13).

    Demonstrates three ways of representing a collection of values,
    realizes a tuple-valued Func, implements an argmax as a tuple
    reduction and checks it against a plain-Python loop, and finally
    renders a Mandelbrot set as ASCII art using a small Complex helper
    class built on tuples.

    Returns:
        0 once every step has run and "Success!" has been printed.
    """

    # So far Funcs (such as the one below) have evaluated to a single
    # scalar value for each point in their domain.
    single_valued = hl.Func()
    x, y = hl.Var("x"), hl.Var("y")
    single_valued[x, y] = x + y

    # One way to write a hl.Func that returns a collection of values is
    # to add an additional dimension which indexes that
    # collection. This is how we typically deal with color. For
    # example, the hl.Func below represents a collection of three values
    # for every x, y coordinate indexed by c.
    color_image = hl.Func()
    c = hl.Var("c")
    color_image[x, y, c] = hl.select(c == 0, 245, # Red value
                                  c == 1, 42,  # Green value
                                  132)        # Blue value

    # This method is often convenient because it makes it easy to
    # operate on this hl.Func in a way that treats each item in the
    # collection equally:
    brighter = hl.Func()
    brighter[x, y, c] = color_image[x, y, c] + 10

    # However this method is also inconvenient for three reasons.
    #
    # 1) Funcs are defined over an infinite domain, so users of this
    # hl.Func can for example access color_image(x, y, -17), which is
    # not a meaningful value and is probably indicative of a bug.
    #
    # 2) It requires a hl.select, which can impact performance if not
    # bounded and unrolled:
    # brighter.bound(c, 0, 3).unroll(c)
    #
    # 3) With this method, all values in the collection must have the
    # same type. While the above two issues are merely inconvenient,
    # this one is a hard limitation that makes it impossible to
    # express certain things in this way.

    # It is also possible to represent a collection of values as a
    # collection of Funcs:
    func_array = [hl.Func() for i in range(3)]
    func_array[0][x, y] = x + y
    func_array[1][x, y] = hl.sin(x)
    func_array[2][x, y] = hl.cos(y)

    # This method avoids the three problems above, but introduces a
    # new annoyance. Because these are separate Funcs, it is
    # difficult to schedule them so that they are all computed
    # together inside a single loop over x, y.

    # A third alternative is to define a hl.Func as evaluating to a
    # Tuple instead of an hl.Expr. A Tuple is a fixed-size collection of
    # Exprs which may have different type. The following function
    # evaluates to an integer value (x+y), and a floating point value
    # (hl.sin(x*y)).
    multi_valued = hl.Func("multi_valued")
    multi_valued[x, y] = (x + y, hl.sin(x * y))

    # Realizing a tuple-valued hl.Func returns a collection of
    # Buffers. We call this a Realization. It's equivalent to a
    # std::vector of hl.Buffer/Image objects:
    if True:
        (im1, im2) = multi_valued.realize(80, 60)
        assert type(im1) is hl.Buffer_int32
        assert type(im2) is hl.Buffer_float32
        assert im1(30, 40) == 30 + 40
        assert numpy.isclose(im2(30, 40), math.sin(30 * 40))


    # All Tuple elements are evaluated together over the same domain
    # in the same loop nest, but stored in distinct allocations. The
    # equivalent plain-Python/numpy code to the above is:
    if True:
        multi_valued_0 = numpy.empty((80*60), dtype=numpy.int32)
        # The second tuple element is floating point; an int32 array
        # would silently truncate every sine value.
        multi_valued_1 = numpy.empty((80*60), dtype=numpy.float32)

        # realize(80, 60) spans 80 values of x and 60 values of y, so x
        # is the inner loop and the row stride is 80.
        for yy in range(60):
            for xx in range(80):
                multi_valued_0[xx + 80*yy] = xx + yy
                multi_valued_1[xx + 80*yy] = math.sin(xx*yy)


    # When compiling ahead-of-time, a Tuple-valued hl.Func evaluates
    # into multiple distinct output buffer_t structs. These appear in
    # order at the end of the function signature:
    # int multi_valued(...input buffers and params..., buffer_t *output_1, buffer_t *output_2)

    # In C++ a Tuple can be built either with the Tuple constructor or
    # with a braced initializer list. In this Python interface both
    # forms are written as a plain Python tuple of Exprs:
    multi_valued_2 = hl.Func("multi_valued_2")
    multi_valued_2[x, y] = (x + y, hl.sin(x * y))

    # Calls to a multi-valued hl.Func cannot be treated as Exprs. The
    # following is an error:
    # hl.Func consumer
    # consumer[x, y] = multi_valued_2[x, y] + 10

    # Instead you must index the returned object with square brackets
    # to retrieve the individual Exprs:
    integer_part = multi_valued_2[x, y][0]
    floating_part = multi_valued_2[x, y][1]
    assert type(integer_part) is hl.FuncTupleElementRef
    assert type(floating_part) is hl.FuncTupleElementRef

    consumer = hl.Func()
    consumer[x, y] = (integer_part + 10, floating_part + 10.0)

    # Tuple reductions.
    if True:
        # Tuples are particularly useful in reductions, as they allow
        # the reduction to maintain complex state as it walks along
        # its domain. The simplest example is an argmax.

        # First we create an Image to take the argmax over.
        input_func = hl.Func()
        input_func[x] = hl.sin(x)
        input_buf = input_func.realize(100)
        assert type(input_buf) is hl.Buffer_float32

        # Then we define a 2-valued Tuple which tracks the index of the
        # maximum value and the value itself.
        arg_max = hl.Func()

        # Pure definition.
        # (using [()] for zero-dimensional Funcs is a convention of this python interface)
        arg_max[()] = (0, input_buf(0))

        # Update definition.
        r = hl.RDom(1, 99)
        old_index = arg_max[()][0]
        old_max   = arg_max[()][1]
        # Move the index only when a strictly larger value is found, so
        # the first occurrence of the maximum wins.
        new_index = hl.select(old_max < input_buf[r], r, old_index)
        new_max   = hl.max(input_buf[r], old_max)
        arg_max[()] = (new_index, new_max)

        # The equivalent plain-Python loop is:
        arg_max_0 = 0
        arg_max_1 = float(input_buf(0))
        for r in range(1, 100):
            old_index = arg_max_0
            old_max = arg_max_1
            new_index = r if (old_max < input_buf(r)) else old_index
            new_max = max(input_buf(r), old_max)
            # In a tuple update definition, all loads and computation
            # are done before any stores, so that all Tuple elements
            # are updated atomically with respect to recursive calls
            # to the same hl.Func.
            arg_max_0 = new_index
            arg_max_1 = new_max


        # Let's verify that Halide and the Python loop found the same
        # maximum value and index.
        if True:
            (r0, r1) = arg_max.realize()

            assert type(r0) is hl.Buffer_int32
            assert type(r1) is hl.Buffer_float32
            assert arg_max_0 == r0(0)
            assert numpy.isclose(arg_max_1, r1(0))


        # Halide provides hl.argmax and hl.argmin as built-in reductions
        # similar to sum, product, maximum, and minimum. They return
        # a Tuple consisting of the point in the reduction domain
        # corresponding to that value, and the value itself. In the
        # case of ties they return the first value found. We'll use
        # one of these in the following section.


    # Tuples for user-defined types.
    if True:
        # Tuples can also be a convenient way to represent compound
        # objects such as complex numbers. Defining an object that
        # can be converted to and from a Tuple is one way to extend
        # Halide's type system with user-defined types.
        class Complex:
            """Complex number whose parts are Halide expressions."""

            def __init__(self, r, i=None):
                """Build from two floats, two Exprs, or one 2-element sequence."""
                if isinstance(r, float) and isinstance(i, float):
                    self.real = hl.Expr(r)
                    self.imag = hl.Expr(i)
                elif i is not None:
                    self.real = r
                    self.imag = i
                else:
                    self.real = r[0]
                    self.imag = r[1]

            def as_tuple(self):
                "Convert to a Tuple"
                return (self.real, self.imag)


            def __add__(self, other):
                "Complex addition"
                return Complex(self.real + other.real, self.imag + other.imag)


            def __mul__(self, other):
                "Complex multiplication"
                return Complex(self.real * other.real - self.imag * other.imag,
                               self.real * other.imag + self.imag * other.real)

            def __getitem__(self, idx):
                return (self.real, self.imag)[idx]

            def __len__(self):
                return 2

            def magnitude(self):
                "Squared complex magnitude (no square root is taken)"
                return (self.real * self.real) + (self.imag * self.imag)


            # Other complex operators would go here. The above are
            # sufficient for this example.


        # Let's use the Complex class to compute a Mandelbrot set.
        mandelbrot = hl.Func()

        # The initial complex value corresponding to an x, y coordinate
        # in our hl.Func.
        initial = Complex(x/15.0 - 2.5, y/6.0 - 2.0)

        # Pure definition.
        t = hl.Var("t")
        mandelbrot[x, y, t] = Complex(0.0, 0.0)

        # We'll use an update definition to take 12 steps.
        r = hl.RDom(1, 12)
        current = Complex(mandelbrot[x, y, r-1])

        # The following line uses the complex multiplication and
        # addition we defined above.
        mandelbrot[x, y, r] = (Complex(current*current) + initial)

        # We'll use another tuple reduction to compute the iteration
        # number where the value first escapes a circle of radius 4.
        # This can be expressed as an hl.argmin of a boolean - we want
        # the index of the first time the given boolean expression is
        # false (we consider false to be less than true).  The argmax
        # would return the index of the first time the expression is
        # true.

        # magnitude() is the squared magnitude, hence 16.0 for radius 4.
        escape_condition = Complex(mandelbrot[x, y, r]).magnitude() < 16.0
        first_escape = hl.argmin(escape_condition)
        assert type(first_escape) is tuple
        # We only want the index, not the value, but hl.argmin returns
        # both, so we'll index the hl.argmin Tuple expression using
        # square brackets to get the hl.Expr representing the index.
        escape = hl.Func()
        escape[x, y] = first_escape[0]

        # Realize the pipeline and print the result as ascii art.
        result = escape.realize(61, 25)
        assert type(result) is hl.Buffer_int32
        # The reduction domain covers indices 1..12, so the palette needs
        # 13 entries; with only 12 the deepest points were skipped and
        # the rows came out misaligned.
        code = " .:-~*={}&%#@"
        for yy in range(result.height()):
            for xx in range(result.width()):
                index = result(xx, yy)
                # Fall back to a blank so every row keeps its full width
                # even if an unexpected index shows up.
                print(code[index] if index < len(code) else " ", end="")
            print("")


    print("Success!")

    return 0

示例#18

0

显示文件

文件： bilateral_grid.py 项目： adityaatluri/Halide

def get_bilateral_grid(input, r_sigma, s_sigma):
    """Build and schedule a bilateral-grid filter pipeline.

    A grid ("histogram") is accumulated from ``s_sigma`` x ``s_sigma``
    windows of the clamped input, blurred with a five-tap filter along z,
    x and y, sampled back with trilinear interpolation, and normalized by
    dividing channel 0 by channel 1.

    Args:
        input: image indexed as ``[x, y]`` after a repeat-edge boundary
            condition is applied; values are clamped to [0, 1].
        r_sigma: divisor that maps an intensity to a grid z coordinate.
        s_sigma: spatial size of one grid cell; floor-divided by 2 here,
            so presumably a Python int - confirm against the caller.

    Returns:
        The scheduled 2-D ``bilateral_grid`` Func.
    """
    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')
    # Inner tile variables for the GPU schedule. They must remain Vars,
    # so the grid-coordinate expressions below use different names.
    xi = hl.Var("xi")
    yi = hl.Var("yi")
    zi = hl.Var("zi")

    # Add a boundary condition
    clamped = hl.BoundaryConditions.repeat_edge(input)

    # Construct the bilateral grid
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    val = clamped[x * s_sigma + r.x - s_sigma // 2, y * s_sigma + r.y - s_sigma // 2]
    val = hl.clamp(val, 0.0, 1.0)

    # Grid z bin of each sample, rounded to the nearest bin.
    z_bin = hl.i32(val / r_sigma + 0.5)

    # Channel 0 accumulates intensities, channel 1 accumulates counts.
    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0
    histogram[x, y, z_bin, c] += hl.select(c == 0, val, 1.0)

    # Blur the histogram using a five-tap filter
    blurx, blury, blurz = hl.Func('blurx'), hl.Func('blury'), hl.Func('blurz')
    blurz[x, y, z, c] = histogram[x, y, z-2, c] + histogram[x, y, z-1, c]*4 + histogram[x, y, z, c]*6 + histogram[x, y, z+1, c]*4 + histogram[x, y, z+2, c]
    blurx[x, y, z, c] = blurz[x-2, y, z, c] + blurz[x-1, y, z, c]*4 + blurz[x, y, z, c]*6 + blurz[x+1, y, z, c]*4 + blurz[x+2, y, z, c]
    blury[x, y, z, c] = blurx[x, y-2, z, c] + blurx[x, y-1, z, c]*4 + blurx[x, y, z, c]*6 + blurx[x, y+1, z, c]*4 + blurx[x, y+2, z, c]

    # Take trilinear samples to compute the output
    val = hl.clamp(clamped[x, y], 0.0, 1.0)
    zv = val / r_sigma
    grid_z = hl.i32(zv)
    zf = zv - grid_z
    xf = hl.f32(x % s_sigma) / s_sigma
    yf = hl.f32(y % s_sigma) / s_sigma
    grid_x = x / s_sigma
    grid_y = y / s_sigma
    interpolated = hl.Func('interpolated')
    interpolated[x, y, c] = hl.lerp(hl.lerp(hl.lerp(blury[grid_x, grid_y, grid_z, c], blury[grid_x+1, grid_y, grid_z, c], xf),
                                            hl.lerp(blury[grid_x, grid_y+1, grid_z, c], blury[grid_x+1, grid_y+1, grid_z, c], xf), yf),
                                    hl.lerp(hl.lerp(blury[grid_x, grid_y, grid_z+1, c], blury[grid_x+1, grid_y, grid_z+1, c], xf),
                                            hl.lerp(blury[grid_x, grid_y+1, grid_z+1, c], blury[grid_x+1, grid_y+1, grid_z+1, c], xf), yf), zf)

    # Normalize
    bilateral_grid = hl.Func('bilateral_grid')
    bilateral_grid[x, y] = interpolated[x, y, 0] / interpolated[x, y, 1]

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU schedule
        # Currently running this directly from the Python code is very slow.
        # Probably because of the dispatch time because generated code
        # is same speed as C++ generated code.
        print ("Compiling for GPU.")
        # Every gpu_tile call passes explicit inner Vars, matching the
        # form used by the other stages.
        histogram.compute_root().reorder(c, z, x, y).gpu_tile(x, y, xi, yi, 8, 8)
        histogram.update().reorder(c, r.x, r.y, x, y).gpu_tile(x, y, xi, yi, 8, 8).unroll(c)
        blurx.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blury.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blurz.compute_root().gpu_tile(x, y, z, xi, yi, zi, 8, 8, 4)
        bilateral_grid.compute_root().gpu_tile(x, y, xi, yi, s_sigma, s_sigma)
    else:
        # CPU schedule
        print ("Compiling for CPU.")
        histogram.compute_root().parallel(z)
        histogram.update().reorder(c, r.x, r.y, x, y).unroll(c)
        blurz.compute_root().reorder(c, z, x, y).parallel(y).vectorize(x, 4).unroll(c)
        blurx.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        blury.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        bilateral_grid.compute_root().parallel(y).vectorize(x, 4)

    return bilateral_grid

示例#19

0

显示文件

文件： bilateral_grid.py 项目： wsmoses/Halide-AS

def get_bilateral_grid(input, r_sigma, s_sigma):
    """Construct the bilateral-grid pipeline and apply a GPU or CPU schedule.

    Steps: accumulate a per-cell histogram from ``s_sigma`` x ``s_sigma``
    windows of the clamped input, blur it with a 1-4-6-4-1 filter along
    z, x and y, take trilinear samples of the blurred grid, then divide
    channel 0 by channel 1.

    Args:
        input: image indexed as ``[x, y]`` once the repeat-edge boundary
            condition is applied; samples are clamped to [0, 1].
        r_sigma: divisor converting an intensity into a grid z coordinate.
        s_sigma: spatial extent of a grid cell. It is floor-divided by 2
            below, so it is presumably a Python int - verify at call site.

    Returns:
        The scheduled ``bilateral_grid`` Func, defined over ``[x, y]``.
    """
    x = hl.Var('x')
    y = hl.Var('y')
    z = hl.Var('z')
    c = hl.Var('c')
    # Tile-inner Vars consumed by gpu_tile(). The coordinate expressions
    # further down deliberately use other names so these are not rebound.
    xi = hl.Var("xi")
    yi = hl.Var("yi")
    zi = hl.Var("zi")

    # Add a boundary condition
    clamped = hl.BoundaryConditions.repeat_edge(input)

    # Construct the bilateral grid
    r = hl.RDom([(0, s_sigma), (0, s_sigma)], 'r')
    val = clamped[x * s_sigma + r.x - s_sigma // 2, y * s_sigma + r.y - s_sigma // 2]
    val = hl.clamp(val, 0.0, 1.0)

    # Nearest grid z bin for each sample in the window.
    hist_z = hl.i32(val / r_sigma + 0.5)

    # c == 0 sums intensities; every other c sums 1.0 per sample.
    histogram = hl.Func('histogram')
    histogram[x, y, z, c] = 0.0
    histogram[x, y, hist_z, c] += hl.select(c == 0, val, 1.0)

    # Blur the histogram using a five-tap filter
    blurx, blury, blurz = hl.Func('blurx'), hl.Func('blury'), hl.Func('blurz')
    blurz[x, y, z, c] = histogram[x, y, z-2, c] + histogram[x, y, z-1, c]*4 + histogram[x, y, z, c]*6 + histogram[x, y, z+1, c]*4 + histogram[x, y, z+2, c]
    blurx[x, y, z, c] = blurz[x-2, y, z, c] + blurz[x-1, y, z, c]*4 + blurz[x, y, z, c]*6 + blurz[x+1, y, z, c]*4 + blurz[x+2, y, z, c]
    blury[x, y, z, c] = blurx[x, y-2, z, c] + blurx[x, y-1, z, c]*4 + blurx[x, y, z, c]*6 + blurx[x, y+1, z, c]*4 + blurx[x, y+2, z, c]

    # Take trilinear samples to compute the output
    val = hl.clamp(clamped[x, y], 0.0, 1.0)
    zv = val / r_sigma
    cell_z = hl.i32(zv)
    zf = zv - cell_z
    xf = hl.f32(x % s_sigma) / s_sigma
    yf = hl.f32(y % s_sigma) / s_sigma
    cell_x = x / s_sigma
    cell_y = y / s_sigma
    interpolated = hl.Func('interpolated')
    interpolated[x, y, c] = hl.lerp(hl.lerp(hl.lerp(blury[cell_x, cell_y, cell_z, c], blury[cell_x+1, cell_y, cell_z, c], xf),
                                            hl.lerp(blury[cell_x, cell_y+1, cell_z, c], blury[cell_x+1, cell_y+1, cell_z, c], xf), yf),
                                    hl.lerp(hl.lerp(blury[cell_x, cell_y, cell_z+1, c], blury[cell_x+1, cell_y, cell_z+1, c], xf),
                                            hl.lerp(blury[cell_x, cell_y+1, cell_z+1, c], blury[cell_x+1, cell_y+1, cell_z+1, c], xf), yf), zf)

    # Normalize
    bilateral_grid = hl.Func('bilateral_grid')
    bilateral_grid[x, y] = interpolated[x, y, 0] / interpolated[x, y, 1]

    target = hl.get_target_from_environment()
    if target.has_gpu_feature():
        # GPU schedule
        # Currently running this directly from the Python code is very slow.
        # Probably because of the dispatch time because generated code
        # is same speed as C++ generated code.
        print ("Compiling for GPU.")
        # The pure stage is tiled with the same explicit inner Vars as
        # the update stage and the blur stages.
        histogram.compute_root().reorder(c, z, x, y).gpu_tile(x, y, xi, yi, 8, 8)
        histogram.update().reorder(c, r.x, r.y, x, y).gpu_tile(x, y, xi, yi, 8, 8).unroll(c)
        blurx.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blury.compute_root().gpu_tile(x, y, z, xi, yi, zi, 16, 16, 1)
        blurz.compute_root().gpu_tile(x, y, z, xi, yi, zi, 8, 8, 4)
        bilateral_grid.compute_root().gpu_tile(x, y, xi, yi, s_sigma, s_sigma)
    else:
        # CPU schedule
        print ("Compiling for CPU.")
        histogram.compute_root().parallel(z)
        histogram.update().reorder(c, r.x, r.y, x, y).unroll(c)
        blurz.compute_root().reorder(c, z, x, y).parallel(y).vectorize(x, 4).unroll(c)
        blurx.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        blury.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 4).unroll(c)
        bilateral_grid.compute_root().parallel(y).vectorize(x, 4)

    return bilateral_grid

示例#20

0

显示文件

def bilateral_filter(input, width, height):
    """Bilateral-filter channels 1 and 2 of ``input``; pass the rest through.

    Each neighbour in a 7x7 window is weighted by a fixed kernel value
    times a score derived from its difference to the centre sample;
    neighbours whose absolute value exceeds a threshold score 0.

    Args:
        input: image indexed as ``[x, y, c]``.
        width: extent of the mirrored region along x.
        height: extent of the mirrored region along y.

    Returns:
        The scheduled ``bilateral_filter_output`` Func (Float(32) values).
    """
    print('    bilateral_filter')

    # 7x7 kernel buffer, shifted so that its indices run from -3 to 3.
    k = hl.Buffer(hl.Float(32), [7, 7], "gauss_kernel")
    k.translate([-3, -3])

    weights = hl.Func("bilateral_weights")
    total_weights = hl.Func("bilateral_total_weights")
    bilateral = hl.Func("bilateral")
    output = hl.Func("bilateral_filter_output")

    x, y, dx, dy, c = hl.Var("x"), hl.Var("y"), hl.Var("dx"), hl.Var("dy"), hl.Var("c")
    window = hl.RDom([(-3, 7), (-3, 7)])

    # Kernel values, one row per y offset (-3..3); within a row the
    # entries run over x offsets -3..3.
    kernel_rows = (
        (0.000690, 0.002646, 0.005923, 0.007748, 0.005923, 0.002646, 0.000690),
        (0.002646, 0.010149, 0.022718, 0.029715, 0.022718, 0.010149, 0.002646),
        (0.005923, 0.022718, 0.050855, 0.066517, 0.050855, 0.022718, 0.005923),
        (0.007748, 0.029715, 0.066517, 0.087001, 0.066517, 0.029715, 0.007748),
        (0.005923, 0.022718, 0.050855, 0.066517, 0.050855, 0.022718, 0.005923),
        (0.002646, 0.010149, 0.022718, 0.029715, 0.022718, 0.010149, 0.002646),
        (0.000690, 0.002646, 0.005923, 0.007748, 0.005923, 0.002646, 0.000690),
    )

    k.fill(0)
    for y_offset, row in enumerate(kernel_rows, start=-3):
        for x_offset, value in enumerate(row, start=-3):
            k[x_offset, y_offset] = value

    input_mirror = hl.BoundaryConditions.mirror_interior(input, [(0, width), (0, height)])

    # Signed centre-minus-neighbour difference, taken in Int(32) and then
    # converted to Float(32).
    centre_i32 = hl.cast(hl.Int(32), input_mirror[x, y, c])
    neighbour_i32 = hl.cast(hl.Int(32), input_mirror[x + dx, y + dy, c])
    dist = hl.cast(hl.Float(32), centre_i32 - neighbour_i32)

    sig2 = 100

    threshold = 25000

    # Neighbours above the threshold contribute nothing.
    neighbour_too_large = hl.abs(input_mirror[x + dx, y + dy, c]) > threshold
    score = hl.select(neighbour_too_large, 0, hl.exp(-dist * dist / sig2))

    weights[dx, dy, x, y, c] = k[dx, dy] * score

    total_weights[x, y, c] = hl.sum(weights[window.x, window.y, x, y, c])

    weighted_sum = hl.sum(input_mirror[x + window.x, y + window.y, c] *
                          weights[window.x, window.y, x, y, c])
    bilateral[x, y, c] = weighted_sum / total_weights[x, y, c]

    # Pure definition copies the input; the two updates overwrite
    # channels 1 and 2 with the filtered result.
    output[x, y, c] = hl.cast(hl.Float(32), input[x, y, c])

    for channel in (1, 2):
        output[x, y, channel] = bilateral[x, y, channel]

    weights.compute_at(output, y).vectorize(x, 16)

    output.compute_root().parallel(y).vectorize(x, 16)

    for update_index in (0, 1):
        output.update(update_index).parallel(y).vectorize(x, 16)

    return output

示例#21

0

显示文件

文件： stereobm.py 项目： spillai/halide-experiments

def findStereoCorrespondence(left, right, SADWindowSize, minDisparity, numDisparities,
                             xmin, xmax, ymin, ymax,
                             x_tile_size=32, y_tile_size=32, test=False, uniquenessRatio=0.15, disp12MaxDiff=1): 
    """Construct a tiled sum-of-absolute-differences stereo matcher.

    The cost of a disparity ``d`` at a pixel is the sum of
    ``|left[x, y] - right[x - d, y]|`` over a square window of side
    ``SADWindowSize``; the disparity with the lowest cost is selected.

    Args:
        left, right: image-like objects indexed as ``[x, y]``.
        SADWindowSize: window side length (expected to be a Python int,
            since it is halved with floor division).
        minDisparity: first disparity in the search range.
        numDisparities: length of the search range.
        xmin, ymin: offsets added to tile coordinates when reading ``diff``.
        xmax: pixels with ``x >= xmax`` are written as FILTERED.
        ymax: unused; kept so existing callers keep working.
        x_tile_size, y_tile_size: tile extents for compute and schedule.
        test: use direct windowed sums rather than running sums.
        uniquenessRatio, disp12MaxDiff: unused; kept for compatibility.

    Returns:
        The scheduled ``disp`` Func with UInt(16) values.
    """

    x, y, d = Var("x"), Var("y"), Var("d")

    # Matching cost of a single pixel for each disparity d.
    diff = Func("diff")
    diff[d, x, y] = h.cast(UInt(16), h.abs(left[x, y] - right[x-d, y]))

    # "//" keeps this an int on Python 3; "/" would produce a float that
    # then flows into RDom bounds and index arithmetic.
    win2 = SADWindowSize // 2

    # diff addressed by (inner, outer) tile coordinates.
    diff_T = Func("diff_T")
    xi, xo, yi, yo = Var("xi"), Var("xo"), Var("yi"), Var("yo")
    diff_T[d, xi, yi, xo, yo] = diff[d, xi + xo * x_tile_size + xmin, yi + yo * y_tile_size + ymin]

    cSAD, vsum = Func("cSAD"), Func("vsum")
    rk = RDom(-win2, SADWindowSize, "rk")
    rxi, ryi = RDom(1, x_tile_size - 1, "rxi"), RDom(1, y_tile_size - 1, "ryi")

    if test: 
        # Direct form: sum the whole window at every position.
        vsum[d, xi, yi, xo, yo] = h.sum(diff_T[d, xi, yi+rk, xo, yo])
        cSAD[d, xi, yi, xo, yo] = h.sum(vsum[d, xi+rk, yi, xo, yo])
    else: 
        # Incremental form: position 0 holds a full window sum, later
        # positions add the sample entering the window and subtract the
        # one leaving it.
        vsum[d, xi, yi, xo, yo] = h.select(yi != 0, h.cast(UInt(16), 0), h.sum(diff_T[d, xi, rk, xo, yo]))
        vsum[d, xi, ryi, xo, yo] = vsum[d, xi, ryi-1, xo, yo] + diff_T[d, xi, ryi+win2, xo, yo] - diff_T[d, xi, ryi-win2-1, xo, yo]

        cSAD[d, xi, yi, xo, yo] = h.select(xi != 0, h.cast(UInt(16), 0), h.sum(vsum[d, rk, yi, xo, yo]))
        cSAD[d, rxi, yi, xo, yo] = cSAD[d, rxi-1, yi, xo, yo] + vsum[d, rxi+win2, yi, xo, yo] - vsum[d, rxi-win2-1, yi, xo, yo]

    # Reduce over disparities, carrying (best disparity, best cost).
    rd = RDom(minDisparity, numDisparities)
    disp_left = Func("disp_left")
    # Initial cost is the UInt(16) maximum; the previous (2<<16)-1 was
    # 131071 and only became 65535 because the cast wraps.
    disp_left[xi, yi, xo, yo] = h.Tuple(h.cast(UInt(16), minDisparity), h.cast(UInt(16), (1<<16)-1))
    disp_left[xi, yi, xo, yo] = h.tuple_select(
            cSAD[rd, xi, yi, xo, yo] < disp_left[xi, yi, xo, yo][1],
            h.Tuple(h.cast(UInt(16), rd), cSAD[rd, xi, yi, xo, yo]), 
            h.Tuple(disp_left[xi, yi, xo, yo]))

    # Value stored (after the UInt(16) cast) where x is out of range.
    FILTERED = -16
    disp = Func("disp")

    disp[x, y] = h.select(
        # x > xmax-xmin or y > ymax-ymin,
        x < xmax, 
        h.cast(UInt(16), disp_left[x % x_tile_size, y % y_tile_size, x / x_tile_size, y / y_tile_size][0]), 
        h.cast(UInt(16), FILTERED))
        

    # Schedule
    vector_width = 8
    disp.compute_root() \
        .tile(x, y, xo, yo, xi, yi, x_tile_size, y_tile_size).reorder(xi, yi, xo, yo) \
        .vectorize(xi, vector_width).parallel(xo).parallel(yo)

    # reorder storage
    disp_left.reorder_storage(xi, yi, xo, yo)
    diff_T   .reorder_storage(xi, yi, xo, yo, d)
    vsum     .reorder_storage(xi, yi, xo, yo, d)
    cSAD     .reorder_storage(xi, yi, xo, yo, d)

    disp_left.compute_at(disp, xo).reorder(xi, yi, xo, yo) \
                                  .vectorize(xi, vector_width) \
                                  .update() \
                                  .reorder(xi, yi, rd, xo, yo).vectorize(xi, vector_width)

    if test: 
        cSAD.compute_at(disp_left, rd).reorder(xi,  yi, xo, yo, d).vectorize(xi, vector_width)
        vsum.compute_at(disp_left, rd).reorder(xi,  yi, xo, yo, d).vectorize(xi, vector_width)
    else: 
        cSAD.compute_at(disp_left, rd).reorder(xi,  yi, xo, yo, d).vectorize(xi, vector_width) \
                                                                  .update() \
                                                                  .reorder(yi, rxi, xo, yo, d).vectorize(yi, vector_width)
        vsum.compute_at(disp_left, rd).reorder(xi,  yi, xo, yo, d).vectorize(xi, vector_width) \
                                                                  .update() \
                                                                  .reorder(xi, ryi, xo, yo, d).vectorize(xi, vector_width)
    
    return disp

示例#22

0

显示文件

文件： autoscheduler_error.py 项目： rheimbuch/halide_autoscheduler_error

def merge_laplacian(x, y, c, merged_energy, next_energy, prev_lap, next_lap):
    """Pick, per pixel, which Laplacian layer goes into the merged result.

    Where ``merged_energy[x, y]`` equals ``next_energy[x, y]`` the value
    comes from ``next_lap``; everywhere else it comes from ``prev_lap``.

    Returns:
        The ``merged_lap`` Func created through ``mkfunc``.
    """
    result = mkfunc('merged_lap', merged_energy, next_energy, next_lap, prev_lap)
    # True where the merged energy is the one contributed by the next layer.
    next_layer_selected = merged_energy[x, y] == next_energy[x, y]
    result[x, y, c] = hl.select(next_layer_selected, next_lap[x, y, c], prev_lap[x, y, c])
    return result