def _compile_kernels(self): """Call the OpenCL compiler """ for kernel, wg_size in list(self.kernels.items()): kernel_src = get_opencl_code(kernel) if isinstance(wg_size, tuple): wg_size = self.max_workgroup_size try: program = pyopencl.Program(self.ctx, kernel_src).build("-D WORKGROUP_SIZE=%s" % wg_size) except pyopencl.MemoryError as error: raise MemoryError(error) except pyopencl.RuntimeError as error: if kernel == "keypoints_gpu2": logger.warning( "Failed compiling kernel '%s' with workgroup size %s: %s: use low_end alternative", kernel, wg_size, error, ) self.LOW_END += 1 elif kernel == "keypoints_gpu1": logger.warning( "Failed compiling kernel '%s' with workgroup size %s: %s: use CPU alternative", kernel, wg_size, error, ) self.LOW_END += 1 else: logger.error("Failed compiling kernel '%s' with workgroup size %s: %s", kernel, wg_size, error) raise error self.programs[kernel] = program for one_function in program.all_kernels(): workgroup_size = kernel_workgroup_size(program, one_function) self.kernels[kernel + "." + one_function.function_name] = workgroup_size
def test_interpolation(self):
    """
    tests the keypoints interpolation kernel
    Requires the following: "self.keypoints1", "self.actual_nb_keypoints", "self.gpu_dog_prev", "self.gpu_dog",
    "self.gpu_dog_next", "self.s", "self.width", "self.height", "self.peakthresh"
    """
    # interpolation_setup :
    border_dist, peakthresh, EdgeThresh, EdgeThresh0, octsize, nb_keypoints, actual_nb_keypoints, width, height, DOGS, s, keypoints_prev, blur = interpolation_setup()

    # actual_nb_keypoints is the number of keypoints returned by "local_maxmin".
    # After the interpolation, it will be reduced, but we can still use it as a boundary.
    maxwg = kernel_workgroup_size(self.program, "interp_keypoint")
    shape = calc_size((keypoints_prev.shape[0],), maxwg)
    gpu_dogs = pyopencl.array.to_device(self.queue, DOGS)
    gpu_keypoints1 = pyopencl.array.to_device(self.queue, keypoints_prev)
    # actual_nb_keypoints = numpy.int32(len((keypoints_prev[:,0])[keypoints_prev[:,1] != -1]))
    start_keypoints = numpy.int32(0)
    actual_nb_keypoints = numpy.int32(actual_nb_keypoints)
    InitSigma = numpy.float32(1.6)  # warning: it must be the same in my_keypoints_interpolation
    t0 = time.time()
    k1 = self.program.interp_keypoint(self.queue, shape, (maxwg,),
                                      gpu_dogs.data, gpu_keypoints1.data,
                                      start_keypoints, actual_nb_keypoints,
                                      peakthresh, InitSigma, width, height)
    res = gpu_keypoints1.get()
    t1 = time.time()
    ref = numpy.copy(keypoints_prev)  # important here
    for i, k in enumerate(ref[:nb_keypoints, :]):
        ref[i] = my_interp_keypoint(DOGS, s, k[1], k[2], 5, peakthresh, width, height)
    t2 = time.time()

    # we have to compare keypoints different from (-1,-1,-1,-1)
    res2 = res[res[:, 1] != -1]
    ref2 = ref[ref[:, 1] != -1]

    if (PRINT_KEYPOINTS):
        logger.info("[s=%s]Keypoints before interpolation: %s", s, actual_nb_keypoints)
        # logger.info(keypoints_prev[0:10,:]
        logger.info("[s=%s]Keypoints after interpolation : %s", s, res2.shape[0])
        logger.info(res[0:actual_nb_keypoints])  # [0:10,:]
        # logger.info("Ref:")
        # logger.info(ref[0:32,:]

    # print(maxwg, self.maxwg, self.wg[0], self.wg[1])
    if self.maxwg < self.wg[0] * self.wg[1]:
        logger.info("Not testing result as the WG is too little %s", self.maxwg)
        return
    self.assertLess(abs(len(ref2) - len(res2)) / (len(ref2) + len(res2)), 0.33,
                    "the number of keypoints is almost the same")
    # print(ref2)
    # print(res2)
    delta = norm_L1(ref2, res2)
    self.assertLess(delta, 0.43, "delta=%s" % (delta))
    logger.info("delta=%s" % delta)
    if self.PROFILE:
        logger.info("Global execution time: CPU %.3fms, GPU: %.3fms." % (1000.0 * (t2 - t1), 1000.0 * (t1 - t0)))
        logger.info("Keypoints interpolation took %.3fms" % (1e-6 * (k1.profile.end - k1.profile.start)))

def _compile_kernels(self): """Call the OpenCL compiler """ for kernel, wg_size in list(self.kernels.items()): kernel_src = get_opencl_code(os.path.join("sift", kernel)) if isinstance(wg_size, tuple): wg_size = self.max_workgroup_size try: program = pyopencl.Program(self.ctx, kernel_src).build( '-D WORKGROUP_SIZE=%s' % wg_size) except pyopencl.MemoryError as error: raise MemoryError(error) except pyopencl.RuntimeError as error: if kernel == "keypoints_gpu2": logger.warning( "Failed compiling kernel '%s' with workgroup size %s: %s: use low_end alternative", kernel, wg_size, error) self.LOW_END += 1 elif kernel == "keypoints_gpu1": logger.warning( "Failed compiling kernel '%s' with workgroup size %s: %s: use CPU alternative", kernel, wg_size, error) self.LOW_END += 1 else: logger.error( "Failed compiling kernel '%s' with workgroup size %s: %s", kernel, wg_size, error) raise error self.programs[kernel] = program for one_function in program.all_kernels(): workgroup_size = kernel_workgroup_size(program, one_function) self.kernels[kernel + "." + one_function.function_name] = workgroup_size
def test_interpolation(self):
    """
    tests the keypoints interpolation kernel
    Requires the following: "self.keypoints1", "self.actual_nb_keypoints", "self.gpu_dog_prev", "self.gpu_dog",
    "self.gpu_dog_next", "self.s", "self.width", "self.height", "self.peakthresh"
    """
    # interpolation_setup :
    border_dist, peakthresh, EdgeThresh, EdgeThresh0, octsize, nb_keypoints, actual_nb_keypoints, width, height, DOGS, s, keypoints_prev, blur = interpolation_setup()

    # actual_nb_keypoints is the number of keypoints returned by "local_maxmin".
    # After the interpolation, it will be reduced, but we can still use it as a boundary.
    maxwg = kernel_workgroup_size(self.program, "interp_keypoint")
    shape = calc_size((keypoints_prev.shape[0],), maxwg)
    gpu_dogs = pyopencl.array.to_device(self.queue, DOGS)
    gpu_keypoints1 = pyopencl.array.to_device(self.queue, keypoints_prev)
    # actual_nb_keypoints = numpy.int32(len((keypoints_prev[:,0])[keypoints_prev[:,1] != -1]))
    start_keypoints = numpy.int32(0)
    actual_nb_keypoints = numpy.int32(actual_nb_keypoints)
    InitSigma = numpy.float32(1.6)  # warning: it must be the same in my_keypoints_interpolation
    t0 = time.time()
    k1 = self.program.interp_keypoint(self.queue, shape, (maxwg,),
                                      gpu_dogs.data, gpu_keypoints1.data,
                                      start_keypoints, actual_nb_keypoints,
                                      peakthresh, InitSigma, width, height)
    res = gpu_keypoints1.get()
    t1 = time.time()
    ref = numpy.copy(keypoints_prev)  # important here
    for i, k in enumerate(ref[:nb_keypoints, :]):
        ref[i] = my_interp_keypoint(DOGS, s, k[1], k[2], 5, peakthresh, width, height)
    t2 = time.time()

    # we have to compare keypoints different from (-1,-1,-1,-1)
    res2 = res[res[:, 1] != -1]
    ref2 = ref[ref[:, 1] != -1]

    if (PRINT_KEYPOINTS):
        logger.info("[s=%s]Keypoints before interpolation: %s", s, actual_nb_keypoints)
        # logger.info(keypoints_prev[0:10,:]
        logger.info("[s=%s]Keypoints after interpolation : %s", s, res2.shape[0])
        logger.info(res[0:actual_nb_keypoints])  # [0:10,:]
        # logger.info("Ref:")
        # logger.info(ref[0:32,:]

    # print(maxwg, self.maxwg, self.wg[0], self.wg[1])
    if self.maxwg < self.wg[0] * self.wg[1]:
        logger.info("Not testing result as the WG is too little %s", self.maxwg)
        return
    self.assertLess(abs(len(ref2) - len(res2)) / (len(ref2) + len(res2)), 0.33,
                    "the number of keypoints is almost the same")
    # print(ref2)
    # print(res2)
    delta = norm_L1(ref2, res2)
    self.assertLess(delta, 0.43, "delta=%s" % (delta))
    logger.info("delta=%s" % delta)
    if self.PROFILE:
        logger.info("Global execution time: CPU %.3fms, GPU: %.3fms." % (1000.0 * (t2 - t1), 1000.0 * (t1 - t0)))
        logger.info("Keypoints interpolation took %.3fms" % (1e-6 * (k1.profile.end - k1.profile.start)))

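# norm_L1 is the scalar metric compared against 0.43 above; its exact definition lives in
# the shared test helpers, so the following is only a plausible sketch of a relative L1
# distance (an assumption, the real helper may differ):
def norm_L1_sketch(ref, res):
    return numpy.abs(ref - res).sum() / numpy.abs(ref + res).sum()
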
def test_rgb(self):
    """
    tests the RGB kernel
    """
    max_wg = kernel_workgroup_size(self.reduction, "max_min_global_stage1")
    if max_wg < self.red_size:
        logger.warning("test_rgb: Skipping test of WG=%s when maximum is %s (%s)",
                       self.red_size, max_wg, self.max_wg)
        return
    lint = numpy.empty((self.input.shape[0], self.input.shape[1], 3), dtype=numpy.uint8)
    lint[:, :, 0] = self.input.astype(numpy.uint8)
    lint[:, :, 1] = self.input.astype(numpy.uint8)
    lint[:, :, 2] = self.input.astype(numpy.uint8)
    t0 = time.time()
    au8 = pyopencl.array.to_device(self.queue, lint)
    k1 = self.program.rgb_to_float(self.queue, self.shape, self.wg, au8.data, self.gpudata.data,
                                   self.IMAGE_W, self.IMAGE_H)
    k2 = self.reduction.max_min_global_stage1(self.queue, (self.red_size * self.red_size,), (self.red_size,),
                                              self.gpudata.data, self.buffers_max_min.data,
                                              (self.IMAGE_W * self.IMAGE_H))
    k3 = self.reduction.max_min_global_stage2(self.queue, (self.red_size,), (self.red_size,),
                                              self.buffers_max_min.data, self.buffers_max.data,
                                              self.buffers_min.data)
    k4 = self.program.normalizes(self.queue, self.shape, self.wg, self.gpudata.data,
                                 self.buffers_min.data, self.buffers_max.data, self.twofivefive.data,
                                 self.IMAGE_W, self.IMAGE_H)
    res = self.gpudata.get()
    t1 = time.time()
    ref = normalize(lint.max(axis=-1))
    t2 = time.time()
    delta = abs(ref - res).max()
    if self.PROFILE:
        logger.info("Global execution time: CPU %.3fms, GPU: %.3fms." % (1000.0 * (t2 - t1), 1000.0 * (t1 - t0)))
        logger.info("Conversion RGB ->float took %.3fms" % (1e-6 * (k1.profile.end - k1.profile.start)))
        logger.info("Reduction stage1 took %.3fms" % (1e-6 * (k2.profile.end - k2.profile.start)))
        logger.info("Reduction stage2 took %.3fms" % (1e-6 * (k3.profile.end - k3.profile.start)))
        logger.info("Normalization %.3fms" % (1e-6 * (k4.profile.end - k4.profile.start)))
        logger.info("--------------------------------------")
    self.assertLess(delta, 1e-4, "delta=%s" % delta)

def test_uint8(self):
    """
    tests the uint8 kernel
    """
    max_wg = kernel_workgroup_size(self.reduction, "max_min_global_stage1")
    if max_wg < self.red_size:
        logger.warning("test_uint8: Skipping test of WG=%s when maximum is %s (%s)",
                       self.red_size, max_wg, self.max_wg)
        return
    lint = self.input.astype(numpy.uint8)
    t0 = time.time()
    au8 = pyopencl.array.to_device(self.queue, lint)
    k1 = self.program.u8_to_float(self.queue, self.shape, self.wg, au8.data, self.gpudata.data,
                                  self.IMAGE_W, self.IMAGE_H)
    # print abs(au8.get() - self.gpudata.get()).max()
    k2 = self.reduction.max_min_global_stage1(self.queue, (self.red_size * self.red_size,), (self.red_size,),
                                              self.gpudata.data, self.buffers_max_min.data,
                                              (self.IMAGE_W * self.IMAGE_H),
                                              pyopencl.LocalMemory(8 * self.red_size))
    k3 = self.reduction.max_min_global_stage2(self.queue, (self.red_size,), (self.red_size,),
                                              self.buffers_max_min.data, self.buffers_max.data,
                                              self.buffers_min.data,
                                              pyopencl.LocalMemory(8 * self.red_size))
    # print self.buffers_max.get(), self.buffers_min.get(), self.input.min(), self.input.max()
    k4 = self.program.normalizes(self.queue, self.shape, self.wg, self.gpudata.data,
                                 self.buffers_min.data, self.buffers_max.data, self.twofivefive.data,
                                 self.IMAGE_W, self.IMAGE_H)
    k4.wait()
    res = self.gpudata.get()
    t1 = time.time()
    ref = normalize(lint)
    t2 = time.time()
    delta = abs(ref - res).max()
    if self.PROFILE:
        logger.info("Global execution time: CPU %.3fms, GPU: %.3fms." % (1000.0 * (t2 - t1), 1000.0 * (t1 - t0)))
        logger.info("Conversion uint8->float took %.3fms" % (1e-6 * (k1.profile.end - k1.profile.start)))
        logger.info("Reduction stage1 took %.3fms" % (1e-6 * (k2.profile.end - k2.profile.start)))
        logger.info("Reduction stage2 took %.3fms" % (1e-6 * (k3.profile.end - k3.profile.start)))
        logger.info("Normalization %.3fms" % (1e-6 * (k4.profile.end - k4.profile.start)))
        logger.info("--------------------------------------")
    self.assertLess(delta, 1e-4, "delta=%s" % delta)

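# The pyopencl.LocalMemory(8 * self.red_size) arguments above presumably reserve one
# (min, max) float pair (2 x 4 bytes) of local memory per work item for the two-stage
# max/min reduction. A sketch of the same sizing, assuming float32 data and an
# illustrative work-group size:
red_size = 64  # hypothetical work-group size for the reduction
bytes_per_item = 2 * numpy.dtype(numpy.float32).itemsize  # one (min, max) pair = 8 bytes
local_buffer = pyopencl.LocalMemory(bytes_per_item * red_size)
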
def _compile_kernels(self):
    """
    Call the OpenCL compiler
    """
    for kernel in list(self.kernels.keys()):
        kernel_src = get_opencl_code(kernel)
        try:
            program = pyopencl.Program(self.ctx, kernel_src).build()
        except pyopencl.MemoryError as error:
            raise MemoryError(error)
        self.program = program
        for one_function in program.all_kernels():
            workgroup_size = kernel_workgroup_size(program, one_function)
            self.kernels[kernel + "." + one_function.function_name] = workgroup_size

def test_v2_even(self):
    """
    test even kernel size
    """
    sigma = 3.0
    size = 28
    ref = gaussian_cpu(sigma, size)
    max_wg = kernel_workgroup_size(self.kernels["gaussian"], "gaussian")
    if max_wg < size:
        logger.warning("Skipping test of WG=%s when maximum is %s", size, max_wg)
        return
    res = self.gaussian_gpu_v2(sigma, size)
    delta = ref - res
    self.assertLess(abs(delta).max(), 1e-6, "gaussians are the same")

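# gaussian_cpu provides the NumPy reference used above; a minimal sketch of such a
# reference under the assumption that it returns a normalised 1D Gaussian of length
# `size` (the project's real helper may differ in centring or normalisation):
def gaussian_cpu_sketch(sigma, size):
    x = numpy.arange(size) - (size - 1.0) / 2.0
    g = numpy.exp(-x * x / (2.0 * sigma * sigma))
    return (g / g.sum()).astype(numpy.float32)
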
def _compile_kernels(self):
    """
    Call the OpenCL compiler
    """
    for kernel in list(self.kernels.keys()):
        kernel_src = get_opencl_code(os.path.join("sift", kernel))
        try:
            program = pyopencl.Program(self.ctx, kernel_src).build()
        except pyopencl.MemoryError as error:
            raise MemoryError(error)
        self.program = program
        for one_function in program.all_kernels():
            workgroup_size = kernel_workgroup_size(program, one_function)
            self.kernels[kernel + "." + one_function.function_name] = workgroup_size

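# kernel_workgroup_size, used throughout these snippets, presumably wraps the standard
# pyopencl work-group query; a minimal sketch under that assumption (it accepts either a
# kernel name or an already-built pyopencl.Kernel, since both are passed in the code above):
def kernel_workgroup_size_sketch(program, kernel):
    if isinstance(kernel, str):
        kernel = pyopencl.Kernel(program, kernel)
    device = program.get_info(pyopencl.program_info.DEVICES)[0]
    return kernel.get_work_group_info(pyopencl.kernel_work_group_info.WORK_GROUP_SIZE, device)
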
def test_uint8(self):
    """
    tests the uint8 kernel
    """
    max_wg = kernel_workgroup_size(self.reduction, "max_min_global_stage1")
    if max_wg < self.red_size:
        logger.warning("test_uint8: Skipping test of WG=%s when maximum is %s (%s)",
                       self.red_size, max_wg, self.max_wg)
        return
    lint = self.input.astype(numpy.uint8)
    t0 = time.time()
    au8 = pyopencl.array.to_device(self.queue, lint)
    k1 = self.program.u8_to_float(self.queue, self.shape, self.wg, au8.data, self.gpudata.data,
                                  self.IMAGE_W, self.IMAGE_H)
    # print abs(au8.get() - self.gpudata.get()).max()
    k2 = self.reduction.max_min_global_stage1(self.queue, (self.red_size * self.red_size,), (self.red_size,),
                                              self.gpudata.data, self.buffers_max_min.data,
                                              (self.IMAGE_W * self.IMAGE_H),
                                              pyopencl.LocalMemory(8 * self.red_size))
    k3 = self.reduction.max_min_global_stage2(self.queue, (self.red_size,), (self.red_size,),
                                              self.buffers_max_min.data, self.buffers_max.data,
                                              self.buffers_min.data,
                                              pyopencl.LocalMemory(8 * self.red_size))
    # print self.buffers_max.get(), self.buffers_min.get(), self.input.min(), self.input.max()
    k4 = self.program.normalizes(self.queue, self.shape, self.wg, self.gpudata.data,
                                 self.buffers_min.data, self.buffers_max.data, self.twofivefive.data,
                                 self.IMAGE_W, self.IMAGE_H)
    k4.wait()
    res = self.gpudata.get()
    t1 = time.time()
    ref = normalize(lint)
    t2 = time.time()
    delta = abs(ref - res).max()
    if self.PROFILE:
        logger.info("Global execution time: CPU %.3fms, GPU: %.3fms." % (1000.0 * (t2 - t1), 1000.0 * (t1 - t0)))
        logger.info("Conversion uint8->float took %.3fms" % (1e-6 * (k1.profile.end - k1.profile.start)))
        logger.info("Reduction stage1 took %.3fms" % (1e-6 * (k2.profile.end - k2.profile.start)))
        logger.info("Reduction stage2 took %.3fms" % (1e-6 * (k3.profile.end - k3.profile.start)))
        logger.info("Normalization %.3fms" % (1e-6 * (k4.profile.end - k4.profile.start)))
        logger.info("--------------------------------------")
    self.assertLess(delta, 1e-4, "delta=%s" % delta)

def test_rgb(self):
    """
    tests the RGB kernel
    """
    max_wg = kernel_workgroup_size(self.reduction, "max_min_global_stage1")
    if max_wg < self.red_size:
        logger.warning("test_rgb: Skipping test of WG=%s when maximum is %s (%s)",
                       self.red_size, max_wg, self.max_wg)
        return
    lint = numpy.empty((self.input.shape[0], self.input.shape[1], 3), dtype=numpy.uint8)
    lint[:, :, 0] = self.input.astype(numpy.uint8)
    lint[:, :, 1] = self.input.astype(numpy.uint8)
    lint[:, :, 2] = self.input.astype(numpy.uint8)
    t0 = time.time()
    au8 = pyopencl.array.to_device(self.queue, lint)
    k1 = self.program.rgb_to_float(self.queue, self.shape, self.wg, au8.data, self.gpudata.data,
                                   self.IMAGE_W, self.IMAGE_H)
    k2 = self.reduction.max_min_global_stage1(self.queue, (self.red_size * self.red_size,), (self.red_size,),
                                              self.gpudata.data, self.buffers_max_min.data,
                                              (self.IMAGE_W * self.IMAGE_H))
    k3 = self.reduction.max_min_global_stage2(self.queue, (self.red_size,), (self.red_size,),
                                              self.buffers_max_min.data, self.buffers_max.data,
                                              self.buffers_min.data)
    k4 = self.program.normalizes(self.queue, self.shape, self.wg, self.gpudata.data,
                                 self.buffers_min.data, self.buffers_max.data, self.twofivefive.data,
                                 self.IMAGE_W, self.IMAGE_H)
    res = self.gpudata.get()
    t1 = time.time()
    ref = normalize(lint.max(axis=-1))
    t2 = time.time()
    delta = abs(ref - res).max()
    if self.PROFILE:
        logger.info("Global execution time: CPU %.3fms, GPU: %.3fms." % (1000.0 * (t2 - t1), 1000.0 * (t1 - t0)))
        logger.info("Conversion RGB ->float took %.3fms" % (1e-6 * (k1.profile.end - k1.profile.start)))
        logger.info("Reduction stage1 took %.3fms" % (1e-6 * (k2.profile.end - k2.profile.start)))
        logger.info("Reduction stage2 took %.3fms" % (1e-6 * (k3.profile.end - k3.profile.start)))
        logger.info("Normalization %.3fms" % (1e-6 * (k4.profile.end - k4.profile.start)))
        logger.info("--------------------------------------")
    self.assertLess(delta, 1e-4, "delta=%s" % delta)

def setUp(self):
    kernel_src = os.linesep.join(get_opencl_code(os.path.join("sift", i)) for i in ("sift.cl", "algebra.cl"))
    self.program = pyopencl.Program(self.ctx, kernel_src).build()
    self.wg_compact = kernel_workgroup_size(self.program, "compact")

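# calc_size, used by several tests above to build the global NDRange, is assumed to round
# each requested dimension up to the next multiple of the work-group size so that the grid
# covers the whole problem; a minimal sketch under that assumption (the real helper comes
# from the project's utility module):
def calc_size_sketch(shape, blocksize):
    if isinstance(blocksize, int):
        blocksize = (blocksize,) * len(shape)
    return tuple(((dim + block - 1) // block) * block for dim, block in zip(shape, blocksize))

# e.g. calc_size_sketch((100,), 64) -> (128,) and calc_size_sketch((512, 512), (8, 4)) -> (512, 512)
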
def test_transform(self):
    '''
    tests transform kernel
    '''
    if (USE_LENA):
        # original image
        if hasattr(scipy.misc, "ascent"):
            image = scipy.misc.ascent().astype(numpy.float32)
        else:
            image = scipy.misc.lena().astype(numpy.float32)
        image = numpy.ascontiguousarray(image[0:512, 0:512])

        # transformation
        angle = 1.9  # numpy.pi/5.0
        # matrix = numpy.array([[numpy.cos(angle),-numpy.sin(angle)],[numpy.sin(angle),numpy.cos(angle)]],dtype=numpy.float32)
        # offset_value = numpy.array([1000.0, 100.0],dtype=numpy.float32)
        # matrix = numpy.array([[0.9,0.2],[-0.4,0.9]],dtype=numpy.float32)
        # offset_value = numpy.array([-20.0,256.0],dtype=numpy.float32)
        matrix = numpy.array([[1.0, -0.75], [0.7, 0.5]], dtype=numpy.float32)
        offset_value = numpy.array([250.0, -150.0], dtype=numpy.float32)
        image2 = scipy.ndimage.interpolation.affine_transform(image, matrix, offset=offset_value, order=1, mode="constant")
    else:
        # use images of a stack
        image = scipy.misc.imread("/home/paleo/Titanium/test/frame0.png")
        image2 = scipy.misc.imread("/home/paleo/Titanium/test/frame1.png")
        offset_value = numpy.array([0.0, 0.0], dtype=numpy.float32)

    image_height, image_width = image.shape
    image2_height, image2_width = image2.shape
    fill_value = numpy.float32(0.0)
    mode = numpy.int32(1)

    if IMAGE_RESHAPE:  # turns out that image should always be reshaped
        output_height, output_width = int(3000), int(3000)
        image, image_height, image_width = self.image_reshape(image, output_height, output_width, image_height, image_width)
        image2, image2_height, image2_width = self.image_reshape(image2, output_height, output_width, image2_height, image2_width)
    else:
        output_height, output_width = int(image_height * numpy.sqrt(2)), int(image_width * numpy.sqrt(2))
    logger.info("Image : (%s, %s) -- Output: (%s, %s)", image_height, image_width, output_height, output_width)

    # perform correction by least square
    sol, MSE = self.matching_correction(image, image2)
    logger.info(sol)

    correction_matrix = numpy.zeros((2, 2), dtype=numpy.float32)
    correction_matrix[0] = sol[0:2, 0]
    correction_matrix[1] = sol[3:5, 0]
    matrix_for_gpu = correction_matrix.reshape(4, 1)  # for float4 struct
    offset_value[0] = sol[2, 0]
    offset_value[1] = sol[5, 0]

    maxwg = kernel_workgroup_size(self.program, "transform")
    wg = maxwg, 1
    shape = calc_size((output_width, output_height), wg)
    gpu_image = pyopencl.array.to_device(self.queue, image2)
    gpu_output = pyopencl.array.empty(self.queue, (output_height, output_width), dtype=numpy.float32, order="C")
    gpu_matrix = pyopencl.array.to_device(self.queue, matrix_for_gpu)
    gpu_offset = pyopencl.array.to_device(self.queue, offset_value)
    image_height, image_width = numpy.int32((image_height, image_width))
    output_height, output_width = numpy.int32((output_height, output_width))

    t0 = time.time()
    k1 = self.program.transform(self.queue, shape, wg,
                                gpu_image.data, gpu_output.data, gpu_matrix.data, gpu_offset.data,
                                image_width, image_height, output_width, output_height, fill_value, mode)
    res = gpu_output.get()
    t1 = time.time()
    # logger.info(res[0,0]
    ref = scipy.ndimage.interpolation.affine_transform(image2, correction_matrix, offset=offset_value,
                                                       output_shape=(output_height, output_width),
                                                       order=1, mode="constant", cval=fill_value)
    t2 = time.time()

    delta = abs(res - image)
    delta_arg = delta.argmax()
    delta_max = delta.max()
    # delta_mse_res = ((res-image)**2).sum()/image.size
    # delta_mse_ref = ((ref-image)**2).sum()/image.size
    at_0, at_1 = delta_arg / output_width, delta_arg % output_width
    logger.info("Max error: %f at (%d, %d)", delta_max, at_0, at_1)
    # print("Mean Squared Error Res/Original : %f" %(delta_mse_res))
    # print("Mean Squared Error Ref/Original: %f" %(delta_mse_ref))
    logger.info("minimal MSE according to least squares : %f", MSE)
    # logger.info(res[at_0,at_1]
    # logger.info(ref[at_0,at_1]

    if self.PROFILE:
        logger.info("Global execution time: CPU %.3fms, GPU: %.3fms.", 1000.0 * (t2 - t1), 1000.0 * (t1 - t0))
        logger.info("Transformation took %.3fms", 1e-6 * (k1.profile.end - k1.profile.start))

def test_descriptor(self):
    '''
    #tests keypoints descriptors creation kernel
    '''
    if self.abort:
        return
    # Descriptor_setup
    keypoints_o, nb_keypoints, actual_nb_keypoints, grad, ori, octsize = descriptor_setup()
    # keypoints should be a compacted vector of keypoints
    keypoints_o, compact_cnt = my_compact(numpy.copy(keypoints_o), nb_keypoints)
    actual_nb_keypoints = compact_cnt
    keypoints_start, keypoints_end = 0, actual_nb_keypoints
    keypoints = keypoints_o[keypoints_start:keypoints_end + 2]  # to check if we actually stop at keypoints_end
    logger.info("Working on keypoints : [%s,%s] (octave = %s)" %
                (keypoints_start, keypoints_end - 1, int(numpy.log2(octsize) + 1)))

    # Prepare kernel call
    wg = self.wg_keypoint
    if len(wg) == 1:
        shape = keypoints.shape[0] * wg[0],
    else:
        shape = keypoints.shape[0] * wg[0], wg[1], wg[2]
    kernel = self.program_keypoint.all_kernels()[0]
    # kernel_name = kernel.name
    max_wg = kernel_workgroup_size(self.program_keypoint, kernel)
    if max_wg < wg[0]:
        logger.warning("test_descriptor: Skipping test of WG=%s when maximum for this kernel is %s ", wg, max_wg)
        return

    gpu_keypoints = pyopencl.array.to_device(self.queue, keypoints)
    gpu_descriptors = pyopencl.array.zeros(self.queue, (keypoints_end - keypoints_start, 128),
                                           dtype=numpy.uint8, order="C")
    gpu_grad = pyopencl.array.to_device(self.queue, grad)
    gpu_ori = pyopencl.array.to_device(self.queue, ori)
    keypoints_start, keypoints_end = numpy.int32(keypoints_start), numpy.int32(keypoints_end)
    grad_height, grad_width = numpy.int32(grad.shape)
    counter = pyopencl.array.to_device(self.queue, keypoints_end)
    kargs = [gpu_keypoints.data, gpu_descriptors.data, gpu_grad.data, gpu_ori.data,
             numpy.int32(octsize), keypoints_start, counter.data, grad_width, grad_height]

    # Call the kernel
    t0 = time.time()
    k1 = kernel(self.queue, shape, wg, *kargs)
    try:
        res = gpu_descriptors.get()
    except (pyopencl.LogicError, RuntimeError) as error:
        logger.warning("Segmentation fault like error (%s) on Descriptor for %s" % (error, self.param))
        return
    t1 = time.time()

    # Reference Python implementation
    ref = my_descriptor(keypoints_o, grad, ori, octsize, keypoints_start, keypoints_end)
    ref_sort = ref[numpy.argsort(keypoints[keypoints_start:keypoints_end, 1])]
    t2 = time.time()

    res_sort = res[numpy.argsort(keypoints[keypoints_start:keypoints_end, 1])]
    logger.info(res_sort[5:10])
    logger.info(ref_sort[5:10])
    logger.info("Comparing descriptors (OpenCL and cpp) :")
    match, nulldesc = descriptors_compare(ref[keypoints_start:keypoints_end], res)
    logger.info("%s/%s match found", match, (keypoints_end - keypoints_start) - nulldesc)

    if self.PROFILE:
        logger.info("Global execution time: CPU %.3fms, GPU: %.3fms.", 1000.0 * (t2 - t1), 1000.0 * (t1 - t0))
        logger.info("Descriptors computation took %.3fms" % (1e-6 * (k1.profile.end - k1.profile.start)))

def test_orientation(self):
    '''
    #tests keypoints orientation assignment kernel
    '''
    if self.abort:
        return
    # orientation_setup :
    keypoints, nb_keypoints, updated_nb_keypoints, grad, ori, octsize = orientation_setup()
    keypoints, compact_cnt = my_compact(numpy.copy(keypoints), nb_keypoints)
    updated_nb_keypoints = compact_cnt
    logger.info("Number of keypoints before orientation assignment : %s", updated_nb_keypoints)

    # Prepare kernel call
    wg = self.wg_orient
    kernel = self.program_orient.all_kernels()[0]
    max_wg = kernel_workgroup_size(self.program_orient, kernel)
    if max_wg < wg[0]:
        logger.warning("test_orientation: Skipping test of WG=%s when maximum for this kernel is %s ", wg, max_wg)
        return
    shape = keypoints.shape[0] * wg[0],  # shape = calc_size(keypoints.shape, self.wg)
    gpu_keypoints = pyopencl.array.to_device(self.queue, keypoints)
    actual_nb_keypoints = numpy.int32(updated_nb_keypoints)
    gpu_grad = pyopencl.array.to_device(self.queue, grad)
    gpu_ori = pyopencl.array.to_device(self.queue, ori)
    orisigma = numpy.float32(1.5)  # SIFT
    grad_height, grad_width = numpy.int32(grad.shape)
    keypoints_start = numpy.int32(0)
    keypoints_end = numpy.int32(actual_nb_keypoints)
    counter = pyopencl.array.to_device(self.queue, keypoints_end)  # actual_nb_keypoints
    kargs = [gpu_keypoints.data, gpu_grad.data, gpu_ori.data, counter.data,
             octsize, orisigma, nb_keypoints, keypoints_start, keypoints_end,
             grad_width, grad_height]
    if not self.USE_CPU:
        kargs += [pyopencl.LocalMemory(36 * 4), pyopencl.LocalMemory(128 * 4), pyopencl.LocalMemory(128 * 4)]

    # Call the kernel
    t0 = time.time()
    k1 = kernel(self.queue, shape, wg, *kargs)
    res = gpu_keypoints.get()
    cnt = counter.get()
    t1 = time.time()

    # Reference Python implementation
    ref, updated_nb_keypoints = my_orientation(keypoints, nb_keypoints, keypoints_start, keypoints_end,
                                               grad, ori, octsize, orisigma)
    t2 = time.time()

    # sort to compare added keypoints
    upbound = min(cnt, updated_nb_keypoints)
    d1, d2, d3, d4 = keypoints_compare(ref[0:upbound], res[0:upbound])
    self.assertLess(d1, 1e-4, "delta_cols=%s" % (d1))
    self.assertLess(d2, 1e-4, "delta_rows=%s" % (d2))
    self.assertLess(d3, 1e-4, "delta_sigma=%s" % (d3))
    self.assertLess(d4, 1e-1, "delta_angle=%s" % (d4))  # orientation has a poor precision
    logger.info("delta_cols=%s" % d1)
    logger.info("delta_rows=%s" % d2)
    logger.info("delta_sigma=%s" % d3)
    logger.info("delta_angle=%s" % d4)

    if self.PROFILE:
        logger.info("Global execution time: CPU %.3fms, GPU: %.3fms." % (1000.0 * (t2 - t1), 1000.0 * (t1 - t0)))
        logger.info("Orientation assignment took %.3fms" % (1e-6 * (k1.profile.end - k1.profile.start)))

def test_descriptor(self):
    '''
    #tests keypoints descriptors creation kernel
    '''
    if self.abort:
        return
    # Descriptor_setup
    keypoints_o, nb_keypoints, actual_nb_keypoints, grad, ori, octsize = descriptor_setup()
    # keypoints should be a compacted vector of keypoints
    keypoints_o, compact_cnt = my_compact(numpy.copy(keypoints_o), nb_keypoints)
    actual_nb_keypoints = compact_cnt
    keypoints_start, keypoints_end = 0, actual_nb_keypoints
    keypoints = keypoints_o[keypoints_start:keypoints_end + 2]  # to check if we actually stop at keypoints_end
    logger.info("Working on keypoints : [%s,%s] (octave = %s)" %
                (keypoints_start, keypoints_end - 1, int(numpy.log2(octsize) + 1)))

    # Prepare kernel call
    wg = self.wg_keypoint
    if len(wg) == 1:
        shape = keypoints.shape[0] * wg[0],
    else:
        shape = keypoints.shape[0] * wg[0], wg[1], wg[2]
    kernel = self.program_keypoint.all_kernels()[0]
    # kernel_name = kernel.name
    max_wg = kernel_workgroup_size(self.program_keypoint, kernel)
    if max_wg < wg[0]:
        logger.warning("test_descriptor: Skipping test of WG=%s when maximum for this kernel is %s ", wg, max_wg)
        return

    gpu_keypoints = pyopencl.array.to_device(self.queue, keypoints)
    gpu_descriptors = pyopencl.array.zeros(self.queue, (keypoints_end - keypoints_start, 128),
                                           dtype=numpy.uint8, order="C")
    gpu_grad = pyopencl.array.to_device(self.queue, grad)
    gpu_ori = pyopencl.array.to_device(self.queue, ori)
    keypoints_start, keypoints_end = numpy.int32(keypoints_start), numpy.int32(keypoints_end)
    grad_height, grad_width = numpy.int32(grad.shape)
    counter = pyopencl.array.to_device(self.queue, keypoints_end)
    kargs = [gpu_keypoints.data, gpu_descriptors.data, gpu_grad.data, gpu_ori.data,
             numpy.int32(octsize), keypoints_start, counter.data, grad_width, grad_height]

    # Call the kernel
    t0 = time.time()
    k1 = kernel(self.queue, shape, wg, *kargs)
    try:
        res = gpu_descriptors.get()
    except (pyopencl.LogicError, RuntimeError) as error:
        logger.warning("Segmentation fault like error (%s) on Descriptor for %s" % (error, self.param))
        return
    t1 = time.time()

    # Reference Python implementation
    ref = my_descriptor(keypoints_o, grad, ori, octsize, keypoints_start, keypoints_end)
    ref_sort = ref[numpy.argsort(keypoints[keypoints_start:keypoints_end, 1])]
    t2 = time.time()

    res_sort = res[numpy.argsort(keypoints[keypoints_start:keypoints_end, 1])]
    logger.info(res_sort[5:10])
    logger.info(ref_sort[5:10])
    logger.info("Comparing descriptors (OpenCL and cpp) :")
    match, nulldesc = descriptors_compare(ref[keypoints_start:keypoints_end], res)
    logger.info("%s/%s match found", match, (keypoints_end - keypoints_start) - nulldesc)

    if self.PROFILE:
        logger.info("Global execution time: CPU %.3fms, GPU: %.3fms.", 1000.0 * (t2 - t1), 1000.0 * (t1 - t0))
        logger.info("Descriptors computation took %.3fms" % (1e-6 * (k1.profile.end - k1.profile.start)))

def test_orientation(self):
    '''
    #tests keypoints orientation assignment kernel
    '''
    if self.abort:
        return
    # orientation_setup :
    keypoints, nb_keypoints, updated_nb_keypoints, grad, ori, octsize = orientation_setup()
    keypoints, compact_cnt = my_compact(numpy.copy(keypoints), nb_keypoints)
    updated_nb_keypoints = compact_cnt

    # if (USE_CPU):
    #     logger.info("Using CPU-optimized kernels")
    #     wg = 1,
    #     shape = keypoints.shape[0]*wg[0],
    # else:
    #     wg = 128,  # FIXME : have to choose it for histograms
    #     # wg = max(self.wg),
    wg = self.wg_orient
    shape = keypoints.shape[0] * wg[0],  # shape = calc_size(keypoints.shape, self.wg)
    gpu_keypoints = pyopencl.array.to_device(self.queue, keypoints)
    actual_nb_keypoints = numpy.int32(updated_nb_keypoints)
    logger.info("Number of keypoints before orientation assignment : %s", actual_nb_keypoints)
    gpu_grad = pyopencl.array.to_device(self.queue, grad)
    gpu_ori = pyopencl.array.to_device(self.queue, ori)
    orisigma = numpy.float32(1.5)  # SIFT
    grad_height, grad_width = numpy.int32(grad.shape)
    keypoints_start = numpy.int32(0)
    keypoints_end = numpy.int32(actual_nb_keypoints)
    counter = pyopencl.array.to_device(self.queue, keypoints_end)  # actual_nb_keypoints

    max_wg = kernel_workgroup_size(self.program_orient, "orientation_assignment")
    if max_wg < wg[0]:
        logger.warning("test_orientation: Skipping test of WG=%s when maximum for this kernel is %s ", wg, max_wg)
        return

    t0 = time.time()
    k1 = self.program_orient.orientation_assignment(self.queue, shape, wg,
                                                    gpu_keypoints.data, gpu_grad.data, gpu_ori.data, counter.data,
                                                    octsize, orisigma, nb_keypoints, keypoints_start, keypoints_end,
                                                    grad_width, grad_height)
    res = gpu_keypoints.get()
    cnt = counter.get()
    t1 = time.time()

    if (USE_CPP_SIFT):
        import feature
        sc = feature.SiftAlignment()
        ref2 = sc.sift(self.testdata)  # ref2.x, ref2.y, ref2.scale, ref2.angle, ref2.desc --- ref2[numpy.argsort(ref2.y)]).desc
        ref = ref2.angle
        kp_ref = numpy.empty((ref2.size, 4), dtype=numpy.float32)
        kp_ref[:, 0] = ref2.x
        kp_ref[:, 1] = ref2.y
        kp_ref[:, 2] = ref2.scale
        kp_ref[:, 3] = ref2.angle
    else:
        ref, updated_nb_keypoints = my_orientation(keypoints, nb_keypoints, keypoints_start, keypoints_end,
                                                   grad, ori, octsize, orisigma)
    t2 = time.time()

    if (PRINT_KEYPOINTS):
        # logger.info("Keypoints after orientation assignment :")
        # logger.info(res[numpy.argsort(res[0:cnt,1])][0:cnt+10,3]  # res[0:compact_cnt]
        pass
        # logger.info(kp_ref[0:cnt+10]
        # logger.info("Showing error (NOTE: significant error at position (i) should have its opposite at (i+1))"
        # logger.info(res[numpy.argsort(res[0:compact_cnt,1])][0:compact_cnt,3] - ref[0:compact_cnt]
    # logger.info("Total keypoints for kernel : %s -- For Python : %s \t [octsize = %s]" % (cnt, updated_nb_keypoints, octsize))
    # logger.info("Opencl found %s keypoints (%s added)" %(cnt,cnt-compact_cnt))

    # sort to compare added keypoints
    upbound = min(cnt, updated_nb_keypoints)
    d1, d2, d3, d4 = keypoints_compare(ref[0:upbound], res[0:upbound])
    self.assertLess(d1, 1e-4, "delta_cols=%s" % (d1))
    self.assertLess(d2, 1e-4, "delta_rows=%s" % (d2))
    self.assertLess(d3, 1e-4, "delta_sigma=%s" % (d3))
    self.assertLess(d4, 1e-1, "delta_angle=%s" % (d4))  # orientation has a poor precision
    logger.info("delta_cols=%s" % d1)
    logger.info("delta_rows=%s" % d2)
    logger.info("delta_sigma=%s" % d3)
    logger.info("delta_angle=%s" % d4)

    if self.PROFILE:
        logger.info("Global execution time: CPU %.3fms, GPU: %.3fms." % (1000.0 * (t2 - t1), 1000.0 * (t1 - t0)))
        logger.info("Orientation assignment took %.3fms" % (1e-6 * (k1.profile.end - k1.profile.start)))

def test_descriptor(self):
    '''
    #tests keypoints descriptors creation kernel
    '''
    if self.abort:
        return
    # descriptor_setup :
    keypoints_o, nb_keypoints, actual_nb_keypoints, grad, ori, octsize = descriptor_setup()
    # keypoints should be a compacted vector of keypoints
    keypoints_o, compact_cnt = my_compact(numpy.copy(keypoints_o), nb_keypoints)
    actual_nb_keypoints = compact_cnt
    keypoints_start, keypoints_end = 0, actual_nb_keypoints
    keypoints = keypoints_o[keypoints_start:keypoints_end + 52]  # to check if we actually stop at keypoints_end
    logger.info("Working on keypoints : [%s,%s] (octave = %s)" %
                (keypoints_start, keypoints_end - 1, int(numpy.log2(octsize) + 1)))
    if not (USE_CPP_SIFT) and (100 < keypoints_end - keypoints_start):
        logger.info("NOTE: Python implementation of descriptors is slow. Do not handle more than 100 keypoints, or grab a coffee...")

    # if (self.USE_CPU):
    #     logger.info("Using CPU-optimized kernels"
    #     wg = 1,
    #     shape = keypoints.shape[0]*wg[0],
    # else:
    #     wg = (8, 8, 8)
    #     shape = int(keypoints.shape[0]*wg[0]), 8, 8
    #     wg = (4, 4, 8)
    #     shape = int(keypoints.shape[0]*wg[0]), 4, 8
    wg = self.wg_keypoint
    if len(wg) == 1:
        shape = keypoints.shape[0] * wg[0],
    else:
        shape = keypoints.shape[0] * wg[0], wg[1], wg[2]

    gpu_keypoints = pyopencl.array.to_device(self.queue, keypoints)
    # NOTE: for the following line, use pyopencl.array.empty instead of pyopencl.array.zeros if the keypoints are compacted
    gpu_descriptors = pyopencl.array.zeros(self.queue, (keypoints_end - keypoints_start, 128),
                                           dtype=numpy.uint8, order="C")
    gpu_grad = pyopencl.array.to_device(self.queue, grad)
    gpu_ori = pyopencl.array.to_device(self.queue, ori)
    keypoints_start, keypoints_end = numpy.int32(keypoints_start), numpy.int32(keypoints_end)
    grad_height, grad_width = numpy.int32(grad.shape)
    counter = pyopencl.array.to_device(self.queue, keypoints_end)

    max_wg = kernel_workgroup_size(self.program_keypoint, "descriptor")
    if max_wg < wg[0]:
        logger.warning("test_descriptor: Skipping test of WG=%s when maximum for this kernel is %s ", wg, max_wg)
        return

    t0 = time.time()
    k1 = self.program_keypoint.descriptor(self.queue, shape, wg,
                                          gpu_keypoints.data, gpu_descriptors.data, gpu_grad.data, gpu_ori.data,
                                          numpy.int32(octsize), keypoints_start, counter.data,
                                          grad_width, grad_height)
    try:
        res = gpu_descriptors.get()
    except (pyopencl.LogicError, RuntimeError) as error:
        logger.warning("Segmentation fault like error (%s) on Descriptor for %s" % (error, self.param))
        return
    t1 = time.time()

    if (USE_CPP_SIFT):
        import feature
        sc = feature.SiftAlignment()
        ref2 = sc.sift(self.testdata)  # ref2.x, ref2.y, ref2.scale, ref2.angle, ref2.desc --- ref2[numpy.argsort(ref2.y)]).desc
        ref = ref2.desc
        ref_sort = ref
    else:
        ref = my_descriptor(keypoints_o, grad, ori, octsize, keypoints_start, keypoints_end)
        ref_sort = ref[numpy.argsort(keypoints[keypoints_start:keypoints_end, 1])]
    t2 = time.time()

    if (PRINT_KEYPOINTS):
        res_sort = res[numpy.argsort(keypoints[keypoints_start:keypoints_end, 1])]
        logger.info(res_sort[5:10])  # keypoints_end-keypoints_start,0:15]
        # logger.info(res_sort[9]
        pass
        logger.info(ref_sort[5:10])
        # numpy.savetxt("grrr_ocl_4_3.txt",res_sort,fmt='%d')
        # numpy.savetxt("grrr_cpp_4_3.txt",ref_sort,fmt='%d')
        # logger.info(ref[50:80,0:15]#[0:keypoints_end-keypoints_start,0:15]

    if (USE_CPP_SIFT and octsize == 1) or not (USE_CPP_SIFT):
        # this comparison is only relevant for the first keypoints
        logger.info("Comparing descriptors (OpenCL and cpp) :")
        match, nulldesc = descriptors_compare(ref[keypoints_start:keypoints_end], res)
        logger.info("%s/%s match found", match, (keypoints_end - keypoints_start) - nulldesc)
        # logger.info(ref[1,:]
        # logger.info(res[1,:].sum(), ref[1,:].sum()
        # append to existing text file
        # f_handle = file('desc_by_test_keypoints.txt', 'a')
        # numpy.savetxt(f_handle,res_sort,fmt='%d')
        # f_handle.close()

    '''
    For now, the descriptor kernel is not precise enough to get exactly the same descriptors values
    (we have several difference of 1, but it is OK for the SIFT matching).
    Use descriptors_compare(ref,res) to count how many descriptors are exactly the same.

    # sort to compare added keypoints
    delta = abs(res_sort-ref_sort).max()
    self.assert_(delta <= 1, "delta=%s" % (delta))
    logger.info("delta=%s" % delta)
    '''
    if self.PROFILE:
        logger.info("Global execution time: CPU %.3fms, GPU: %.3fms.", 1000.0 * (t2 - t1), 1000.0 * (t1 - t0))
        logger.info("Descriptors computation took %.3fms" % (1e-6 * (k1.profile.end - k1.profile.start)))

def test_transform(self):
    '''
    tests transform kernel
    '''
    # Transformation
    # ---------------
    matrix = numpy.array([[1.0, -0.75], [0.7, 0.5]], dtype=numpy.float32)
    offset_value = numpy.array([250.0, -150.0], dtype=numpy.float32)
    transformation = lambda img: scipy.ndimage.interpolation.affine_transform(img, matrix, offset=offset_value,
                                                                              order=1, mode="constant")
    image_transformed = transformation(self.image)

    fill_value = numpy.float32(0.0)
    mode = numpy.int32(1)

    # computing keypoints matching with SIFT
    sift_plan = SiftPlan(template=self.image, block_size=self.maxwg)
    kp1 = sift_plan.keypoints(self.image)
    kp2 = sift_plan.keypoints(image_transformed)  # image2 and image must have the same size
    match_plan = MatchPlan()  # cls.ctx
    matching = match_plan.match(kp2, kp1)

    # Retrieve the linear transformation from the matching pairs
    sol = matching_correction(matching)
    logger.info(sol)

    # Compute the correction matrix (inverse of transformation)
    correction_matrix = numpy.zeros((2, 2), dtype=numpy.float32)
    correction_matrix[0] = sol[0:2, 0]
    correction_matrix[1] = sol[3:5, 0]
    matrix_for_gpu = correction_matrix.reshape(4, 1)  # for float4 struct
    offset_value[0] = sol[2, 0]
    offset_value[1] = sol[5, 0]

    # Prepare the arguments for the "transform" kernel call
    maxwg = kernel_workgroup_size(self.program, "transform")
    wg = maxwg, 1
    shape = calc_size(self.image.shape[::-1], wg)
    gpu_image = pyopencl.array.to_device(self.queue, image_transformed)
    gpu_output = pyopencl.array.empty(self.queue, self.image.shape, dtype=numpy.float32, order="C")
    gpu_matrix = pyopencl.array.to_device(self.queue, matrix_for_gpu)
    gpu_offset = pyopencl.array.to_device(self.queue, offset_value)
    image_height, image_width = numpy.int32(self.image.shape)
    output_height, output_width = numpy.int32(gpu_output.shape)
    kargs = [gpu_image.data, gpu_output.data, gpu_matrix.data, gpu_offset.data,
             image_width, image_height, output_width, output_height, fill_value, mode]

    # Call the kernel
    t0 = time.time()
    k1 = self.program.transform(self.queue, shape, wg, *kargs)
    res = gpu_output.get()

    # Reference result
    t1 = time.time()
    ref = scipy.ndimage.interpolation.affine_transform(image_transformed, correction_matrix, offset=offset_value,
                                                       output_shape=(output_height, output_width),
                                                       order=1, mode="constant", cval=fill_value)
    t2 = time.time()

    # Compare the implementations
    delta = numpy.abs(res - ref)
    delta_arg = delta.argmax()
    delta_max = delta.max()
    at_0, at_1 = delta_arg / output_width, delta_arg % output_width
    logger.info("Max difference wrt scipy : %f at (%d, %d)", delta_max, at_0, at_1)

    if self.PROFILE:
        logger.info("Global execution time: CPU %.3fms, GPU: %.3fms.", 1000.0 * (t2 - t1), 1000.0 * (t1 - t0))
        logger.info("Transformation took %.3fms", 1e-6 * (k1.profile.end - k1.profile.start))

def test_transform(self):
    '''
    tests transform kernel
    '''
    # Transformation
    # ---------------
    matrix = numpy.array([[1.0, -0.75], [0.7, 0.5]], dtype=numpy.float32)
    offset_value = numpy.array([250.0, -150.0], dtype=numpy.float32)
    transformation = lambda img: scipy.ndimage.interpolation.affine_transform(img, matrix, offset=offset_value,
                                                                              order=1, mode="constant")
    image_transformed = transformation(self.image)

    fill_value = numpy.float32(0.0)
    mode = numpy.int32(1)

    # computing keypoints matching with SIFT
    sift_plan = SiftPlan(template=self.image, max_workgroup_size=self.maxwg)
    kp1 = sift_plan.keypoints(self.image)
    kp2 = sift_plan.keypoints(image_transformed)  # image2 and image must have the same size
    match_plan = MatchPlan()  # cls.ctx
    matching = match_plan.match(kp2, kp1)

    # Retrieve the linear transformation from the matching pairs
    sol = matching_correction(matching)
    logger.info(sol)

    # Compute the correction matrix (inverse of transformation)
    correction_matrix = numpy.zeros((2, 2), dtype=numpy.float32)
    correction_matrix[0] = sol[0:2, 0]
    correction_matrix[1] = sol[3:5, 0]
    matrix_for_gpu = correction_matrix.reshape(4, 1)  # for float4 struct
    offset_value[0] = sol[2, 0]
    offset_value[1] = sol[5, 0]

    # Prepare the arguments for the "transform" kernel call
    maxwg = kernel_workgroup_size(self.program, "transform")
    wg = maxwg, 1
    shape = calc_size(self.image.shape[::-1], wg)
    gpu_image = pyopencl.array.to_device(self.queue, image_transformed)
    gpu_output = pyopencl.array.empty(self.queue, self.image.shape, dtype=numpy.float32, order="C")
    gpu_matrix = pyopencl.array.to_device(self.queue, matrix_for_gpu)
    gpu_offset = pyopencl.array.to_device(self.queue, offset_value)
    image_height, image_width = numpy.int32(self.image.shape)
    output_height, output_width = numpy.int32(gpu_output.shape)
    kargs = [gpu_image.data, gpu_output.data, gpu_matrix.data, gpu_offset.data,
             image_width, image_height, output_width, output_height, fill_value, mode]

    # Call the kernel
    t0 = time.time()
    k1 = self.program.transform(self.queue, shape, wg, *kargs)
    res = gpu_output.get()

    # Reference result
    t1 = time.time()
    ref = scipy.ndimage.interpolation.affine_transform(image_transformed, correction_matrix, offset=offset_value,
                                                       output_shape=(output_height, output_width),
                                                       order=1, mode="constant", cval=fill_value)
    t2 = time.time()

    # Compare the implementations
    delta = numpy.abs(res - ref)
    delta_arg = delta.argmax()
    delta_max = delta.max()
    at_0, at_1 = delta_arg / output_width, delta_arg % output_width
    logger.info("Max difference wrt scipy : %f at (%d, %d)", delta_max, at_0, at_1)

    if self.PROFILE:
        logger.info("Global execution time: CPU %.3fms, GPU: %.3fms.", 1000.0 * (t2 - t1), 1000.0 * (t1 - t0))
        logger.info("Transformation took %.3fms", 1e-6 * (k1.profile.end - k1.profile.start))

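# matching_correction is assumed to fit, in the least-squares sense, the six affine
# parameters mapping the matched keypoints of the transformed image onto the reference
# image, returned as the 6x1 vector unpacked above (sol[0:2] and sol[3:5] are the matrix
# rows, sol[2] and sol[5] the offsets). A minimal sketch under that assumption; the
# (x1, y1, x2, y2) signature is hypothetical since the layout of `matching` is not shown:
def matching_correction_sketch(x1, y1, x2, y2):
    """Fit x1 ~ a*x2 + b*y2 + tx and y1 ~ c*x2 + d*y2 + ty; return [a, b, tx, c, d, ty] as (6, 1)."""
    n = x1.size
    A = numpy.zeros((2 * n, 6), dtype=numpy.float64)
    A[0::2, 0], A[0::2, 1], A[0::2, 2] = x2, y2, 1.0
    A[1::2, 3], A[1::2, 4], A[1::2, 5] = x2, y2, 1.0
    b = numpy.empty(2 * n, dtype=numpy.float64)
    b[0::2], b[1::2] = x1, y1
    sol = numpy.linalg.lstsq(A, b, rcond=None)[0]
    return sol.reshape(6, 1)
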
def test_orientation(self):
    '''
    #tests keypoints orientation assignment kernel
    '''
    if self.abort:
        return
    # orientation_setup :
    keypoints, nb_keypoints, updated_nb_keypoints, grad, ori, octsize = orientation_setup()
    keypoints, compact_cnt = my_compact(numpy.copy(keypoints), nb_keypoints)
    updated_nb_keypoints = compact_cnt
    logger.info("Number of keypoints before orientation assignment : %s", updated_nb_keypoints)

    # Prepare kernel call
    wg = self.wg_orient
    kernel = self.program_orient.all_kernels()[0]
    max_wg = kernel_workgroup_size(self.program_orient, kernel)
    if max_wg < wg[0]:
        logger.warning("test_orientation: Skipping test of WG=%s when maximum for this kernel is %s ", wg, max_wg)
        return
    shape = keypoints.shape[0] * wg[0],  # shape = calc_size(keypoints.shape, self.wg)
    gpu_keypoints = pyopencl.array.to_device(self.queue, keypoints)
    actual_nb_keypoints = numpy.int32(updated_nb_keypoints)
    gpu_grad = pyopencl.array.to_device(self.queue, grad)
    gpu_ori = pyopencl.array.to_device(self.queue, ori)
    orisigma = numpy.float32(1.5)  # SIFT
    grad_height, grad_width = numpy.int32(grad.shape)
    keypoints_start = numpy.int32(0)
    keypoints_end = numpy.int32(actual_nb_keypoints)
    counter = pyopencl.array.to_device(self.queue, keypoints_end)  # actual_nb_keypoints
    kargs = [gpu_keypoints.data, gpu_grad.data, gpu_ori.data, counter.data,
             octsize, orisigma, nb_keypoints, keypoints_start, keypoints_end,
             grad_width, grad_height]
    if not self.USE_CPU:
        kargs += [pyopencl.LocalMemory(36 * 4), pyopencl.LocalMemory(128 * 4), pyopencl.LocalMemory(128 * 4)]

    # Call the kernel
    t0 = time.time()
    k1 = kernel(self.queue, shape, wg, *kargs)
    res = gpu_keypoints.get()
    cnt = counter.get()
    t1 = time.time()

    # Reference Python implementation
    ref, updated_nb_keypoints = my_orientation(keypoints, nb_keypoints, keypoints_start, keypoints_end,
                                               grad, ori, octsize, orisigma)
    t2 = time.time()

    # sort to compare added keypoints
    upbound = min(cnt, updated_nb_keypoints)
    d1, d2, d3, d4 = keypoints_compare(ref[0:upbound], res[0:upbound])
    self.assertLess(d1, 1e-4, "delta_cols=%s" % (d1))
    self.assertLess(d2, 1e-4, "delta_rows=%s" % (d2))
    self.assertLess(d3, 1e-4, "delta_sigma=%s" % (d3))
    self.assertLess(d4, 1e-1, "delta_angle=%s" % (d4))  # orientation has a poor precision
    logger.info("delta_cols=%s" % d1)
    logger.info("delta_rows=%s" % d2)
    logger.info("delta_sigma=%s" % d3)
    logger.info("delta_angle=%s" % d4)

    if self.PROFILE:
        logger.info("Global execution time: CPU %.3fms, GPU: %.3fms." % (1000.0 * (t2 - t1), 1000.0 * (t1 - t0)))
        logger.info("Orientation assignment took %.3fms" % (1e-6 * (k1.profile.end - k1.profile.start)))
