def _init_double_scan(self):
    """Build the scan kernel that handles values and indexes in one pass.

    The kernel scans ``int2`` items with ``a+b``.  Element ``i`` contributes
    ``(value[i], 1)`` when ``index[i]`` is not positive and ``(0, 0)``
    otherwise.  The output step writes the first component of the scan
    result (``item`` in the PyOpenCL scan template) to ``value[i]`` and the
    second component to ``index[i+1]``.

    NOTE(review): the write to ``index[i+1]`` presumably requires ``index``
    to hold one element more than the scanned range -- confirm in the caller.

    :return: a ``GenericScanKernel`` when ``self.block_size > 256``,
             otherwise a ``GenericDebugScanKernel``
    """
    vec_dtype = pyopencl.tools.get_or_register_dtype("int2")

    # Both kernel flavours take exactly the same description of the scan.
    scan_spec = dict(
        dtype=vec_dtype,
        arguments=("__global int *value", "__global int *index"),
        input_expr="index[i]>0 ? (int2)(0, 0) : (int2)(value[i], 1)",
        scan_expr="a+b",
        neutral="(int2)(0,0)",
        output_statement="value[i] = item.s0; index[i+1] = item.s1;",
    )

    # Block sizes of 256 or less use the debug implementation
    # (the original code labels this case "MacOS on CPU").
    kernel_cls = (GenericScanKernel if self.block_size > 256
                  else GenericDebugScanKernel)
    return kernel_cls(self.ctx, **scan_spec)
def _init_compression_scan(self):
    """Initialize the CBF compression scan kernel.

    The kernel works on the difference between consecutive elements of
    ``data`` (the first element is used as-is).  The scan sums the encoded
    size of every difference, so the scan value preceding an element
    (``prev_item``) is the byte offset at which that element is written.

    Encoding implemented by the OpenCL helpers in the preamble:

    * ``|diff| < 128``: 1 byte, the difference itself;
    * ``|diff| < 32768``: 3 bytes, the marker ``-128`` then the two low
      bytes of the difference, least-significant byte first;
    * otherwise: 7 bytes, the markers ``-128, 0, -128`` then the four
      bytes of the difference, least-significant byte first.

    Kernel arguments: ``data`` (input ``int`` buffer), ``compressed``
    (output ``char`` buffer) and ``size`` (``int`` buffer whose first entry
    receives ``last_item``, i.e. the total compressed size).

    :return: a ``GenericScanKernel`` when ``self.block_size >= 64``,
             otherwise a ``GenericDebugScanKernel``
    """
    # OpenCL helpers: size of one encoded difference, and its serialization.
    preamble = """
    int compressed_size(int diff) {
        int abs_diff = abs(diff);

        if (abs_diff < 128) {
            return 1;
        }
        else if (abs_diff < 32768) {
            return 3;
        }
        else {
            return 7;
        }
    }

    void write(const int index, const int diff, global char *output) {
        int abs_diff = abs(diff);

        if (abs_diff < 128) {
            output[index] = (char) diff;
        }
        else if (abs_diff < 32768) {
            output[index] = -128;
            output[index + 1] = (char) (diff >> 0);
            output[index + 2] = (char) (diff >> 8);
        }
        else {
            output[index] = -128;
            output[index + 1] = 0;
            output[index + 2] = -128;
            output[index + 3] = (char) (diff >> 0);
            output[index + 4] = (char) (diff >> 8);
            output[index + 5] = (char) (diff >> 16);
            output[index + 6] = (char) (diff >> 24);
        }
    }
    """

    # A block comment is used on purpose: a ``//`` line comment would
    # swallow the rest of the statement if the snippet ever ended up on a
    # single line, leaving the ``if`` block unclosed.
    output_statement = """
        if (prev_item == 0) {
            /* 1st thread store compressed data size */
            size[0] = last_item;
        }
        write(prev_item, (i == 0) ? data[0] : (data[i] - data[i - 1]), compressed);
    """

    # Both kernel flavours take exactly the same description of the scan.
    scan_spec = dict(
        dtype=numpy.int32,
        preamble=preamble,
        arguments="__global const int *data, __global char *compressed, __global int *size",
        input_expr="compressed_size((i == 0) ? data[0] : (data[i] - data[i - 1]))",
        scan_expr="a+b",
        neutral="0",
        output_statement=output_statement,
    )

    # Block sizes below 64 use the debug implementation
    # (the original code labels this case "MacOS on CPU").
    kernel_cls = (GenericScanKernel if self.block_size >= 64
                  else GenericDebugScanKernel)
    return kernel_cls(self.ctx, **scan_spec)