def gen_calc_planarity_inlined(code, block_size, args):
    ''' Optimised version by manually inlining all code.

    Generator yielding the instructions that, for every filter in
    args['filterbank'].filters, perform two passes over the block:

      1. convolution + abs: the filter's sparse coefficients are applied
         around each element of the input and the absolute value of the
         sum is stored in a scratch buffer.
      2. gather: for each element the maximum of the scratch values
         selected by the filter's mask is taken; from the second filter
         on it is also compared with the value already stored in the
         output, so the output ends up holding the maximum over all
         filters.

    Arguments:
      code       -- code object passed on to scoped_alloc and
                    load_mem_value
      block_size -- (rows, cols) of the block
      args       -- dict with 'filterbank' (required) and the optional
                    base addresses 'out_ptr' (default rows*cols) and
                    'in_ptr' (default 0)

    The scratch buffer starts at rows*cols*2. All three regions are
    addressed row-major as base + i*cols + j.

    Raises IndexError when the filter bank is empty and AssertionError
    when the scratch buffer starts at in_ptr or out_ptr.
    NOTE: only the base addresses are compared; partially overlapping
    regions are not detected.
    '''
    filterbank = args['filterbank']
    filters = filterbank.filters
    if len(filters) == 0:
        # Without any filter no output would be written at all.
        raise IndexError('filterbank contains no filters')

    rows, cols = block_size
    out_ptr = args.get('out_ptr', rows*cols)
    in_ptr = args.get('in_ptr', 0)
    buffer_ptr = rows*cols*2
    assert buffer_ptr != in_ptr, 'scratch buffer must not start at in_ptr'
    assert buffer_ptr != out_ptr, 'scratch buffer must not start at out_ptr'

    for filter_nr, f in enumerate(filters):
        # Dimensions are taken per filter, so the centre offsets and the
        # mask bounds always belong to the filter being processed.
        frows, fcols = f.size()
        hfrow, hfcol = frows//2, fcols//2

        # convolution + abs
        # (range instead of xrange: valid on both Python 2 and Python 3)
        for i in range(rows):
            for j in range(cols):
                with scoped_alloc(code, 1) as acc:
                    # convolution; xor with itself clears the accumulator
                    yield Xor(acc, acc, acc)
                    for x, y, coeff in f.coefficients:
                        # coefficient position relative to the filter centre
                        ii = i + y - hfrow
                        jj = j + x - hfcol
                        with scoped_alloc(code, 2) as (coeff_reg, v):
                            yield Imm(coeff_reg, coeff)
                            # load_mem_value also receives block_size;
                            # presumably it resolves positions outside the
                            # block -- confirm against its definition
                            for instr in load_mem_value(code, in_ptr, (jj, ii), v, block_size):
                                yield instr
                            yield Mul(v, v, coeff_reg)
                            yield Add(acc, acc, v)
                    # absolute value: negate when acc compares below zero
                    with scoped_alloc(code, 1) as const0:
                        yield Imm(const0, 0)
                        yield Cmp(acc, const0)
                        yield Neg(acc, acc, cond='LT')
                    yield MemWImm(buffer_ptr+i*cols + j, acc)

        # gather
        for i in range(rows):
            for j in range(cols):
                with scoped_alloc(code, 1) as max_v:
                    # local max, starting below every possible value
                    yield Imm(max_v, -float('inf'))
                    for ii in range(frows):
                        for jj in range(fcols):
                            if not f.mask[ii][jj]:
                                continue  # skip if not enabled
                            iii = i + ii - hfrow
                            jjj = j + jj - hfcol
                            with scoped_alloc(code, 1) as v:
                                for instr in load_mem_value(code, buffer_ptr, (jjj, iii), v, block_size):
                                    yield instr
                                yield Cmp(v, max_v)
                                yield Mov(max_v, v, cond='GT')
                    # global max: the first filter has no previous result
                    # in the output to compare with
                    if filter_nr != 0:
                        with scoped_alloc(code, 1) as old_v:
                            yield MemRImm(old_v, out_ptr+i*cols+j)
                            yield Cmp(old_v, max_v)
                            yield Mov(max_v, old_v, cond='GT')
                    yield MemWImm(out_ptr+i*cols + j, max_v)
def gen_gather_local_max(code, block_size, args):
    ''' Gather local maximum from mask code generation.

    Generator yielding the instructions that, for every element (i, j)
    of the block, take the maximum of the input values selected by the
    enabled entries of the filter mask (centred on the element) and
    store it at out_ptr + i*cols + j.

    Arguments:
      code       -- code object passed on to scoped_alloc and
                    load_mem_value
      block_size -- (rows, cols) of the block
      args       -- dict with 'filter' (required; provides size() and
                    mask) and the optional base addresses 'out_ptr'
                    (default rows*cols) and 'in_ptr' (default 0)
    '''
    f = args['filter']
    rows, cols = block_size
    frows, fcols = f.size()
    hfrow, hfcol = frows//2, fcols//2
    out_ptr = args.get('out_ptr', rows*cols)
    in_ptr = args.get('in_ptr', 0)

    # range instead of xrange: valid on both Python 2 and Python 3
    for i in range(rows):
        for j in range(cols):
            with scoped_alloc(code, 1) as max_v:
                # start below every possible value
                yield Imm(max_v, -float('inf'))
                for ii in range(frows):
                    for jj in range(fcols):
                        if not f.mask[ii][jj]:
                            continue  # skip if not enabled
                        # mask position relative to the filter centre
                        iii = i + ii - hfrow
                        jjj = j + jj - hfcol
                        with scoped_alloc(code, 1) as v:
                            # load_mem_value also receives block_size;
                            # presumably it resolves positions outside the
                            # block -- confirm against its definition
                            for instr in load_mem_value(code, in_ptr, (jjj, iii), v, block_size):
                                yield instr
                            yield Cmp(v, max_v)
                            yield Mov(max_v, v, cond='GT')
                yield MemWImm(out_ptr+i*cols + j, max_v)
def gen_apply_sparse_filter(code, block_size, args):
    ''' Apply sparse filter code generation.

    Generator yielding the instructions that, for every element (i, j)
    of the block, accumulate coeff * input over the filter's
    (x, y, coeff) entries (positions taken relative to the filter
    centre) and store the sum at out_ptr + i*cols + j.

    Arguments:
      code       -- code object passed on to scoped_alloc and
                    load_mem_value
      block_size -- (rows, cols) of the block
      args       -- dict with 'filter' (required; provides size() and
                    coefficients) and the optional base addresses
                    'out_ptr' (default rows*cols) and 'in_ptr'
                    (default 0)
    '''
    f = args['filter']
    rows, cols = block_size
    frows, fcols = f.size()
    hfrow, hfcol = frows//2, fcols//2
    out_ptr = args.get('out_ptr', rows*cols)
    in_ptr = args.get('in_ptr', 0)

    # range instead of xrange: valid on both Python 2 and Python 3
    for i in range(rows):
        for j in range(cols):
            with scoped_alloc(code, 1) as acc:
                # xor with itself clears the accumulator
                yield Xor(acc, acc, acc)
                for x, y, coeff in f.coefficients:
                    ii = i + y - hfrow
                    jj = j + x - hfcol
                    with scoped_alloc(code, 2) as (coeff_reg, v):
                        yield Imm(coeff_reg, coeff)
                        # load_mem_value also receives block_size;
                        # presumably it resolves positions outside the
                        # block -- confirm against its definition
                        for instr in load_mem_value(code, in_ptr, (jj, ii), v, block_size):
                            yield instr
                        yield Mul(v, v, coeff_reg)
                        yield Add(acc, acc, v)
                yield MemWImm(out_ptr+i*cols + j, acc)