# Create a queue for transferring data and launching computations.
        # Turn on profiling to allow us to check event times.
        queue = cl.CommandQueue(context, context.devices[0],
                                properties=cl.command_queue_properties.PROFILING_ENABLE)
        program = cl.Program(context, open('harris_corner_detection_v1.cl').read()).build(options='')

       
        #Load in image to be analyzed
        host_image = np.array(Image.open(image).convert('L')).astype(np.float32)[::1, ::1].copy()

        #start time after image load for consistency
        start = time.time()
     
        sigma = 1 #Define the standard deviation for the gauussian
        #Generate the 1D first dimensional gaussian kernel
        filter_kernel_derivative = np.asarray(generate_weights(sigma), order = 1).astype(np.float32)
        #Generate the 1D zero derivative gaussian kernel
        filter_kernel_zero = np.asarray(generate_weights(sigma, order = 0)).astype(np.float32)
        #Determine the length of the entire weight vector based on the sigma of the gaussian
        weight_length = len(filter_kernel_derivative) #should be 9 with sigma = 1
        #This is the number of neighbors for each analyzed pixel, should be even number
        window = (weight_length - 1) #window is 8



        host_image_filtered = np.zeros_like(host_image)
        gpu_image_in = cl.Buffer(context, cl.mem_flags.READ_WRITE, host_image.size * 4)
        zero_derivative_out = cl.Buffer(context, cl.mem_flags.READ_WRITE, host_image.size * 4)
        first_derivative_out = cl.Buffer(context, cl.mem_flags.READ_WRITE, host_image.size * 4)
        derivative_kernel_x = cl.Buffer(context, cl.mem_flags.READ_WRITE, filter_kernel_derivative.size * 4)
        zero_kernel = cl.Buffer(context, cl.mem_flags.READ_WRITE, filter_kernel_zero.size * 4)
def harris_get_corners(num_runs = 100, image_file = '1.tif'):
    '''
    Run the OpenCL Harris corner detection pipeline and compare it against the
    serial implementation.

    The four kernels of 'harris_corner_detection_final.cl' are executed
    num_runs times; each run is timed with time.time() starting right after
    the image has been loaded. In the same iteration run_harris(host_image) is
    timed with Timer. After the loop the results of the LAST run are plotted
    and compared against compute_harris_response / get_harris_points.

    INPUTS:
    num_runs: <TYPE INT> number of iterations to run, for profiling against
              the serial version (must be >= 1)
    image_file: <TYPE STRING> the image to be analyzed

    OUTPUT (returned as a tuple, in this order):
    1) Harris_Matrix: response matrix copied back from the OpenCL device
    2) serial_points: harris points from the serial implementation
    3) points: harris points computed from the OpenCL Harris_Matrix
    4) mean_openCL: average time of the OpenCL runs
    5) mean_serial: average time of the serial runs

    SIDE EFFECTS:
    Calls plot_harris_points for the OpenCL and the serial points (image names
    'Harris openCL Image' and 'Harris Serial Image') and prints a
    correctness/timing report.

    RAISES:
    ValueError if num_runs is smaller than 1 (there would be no result to
    plot, compare or average).
    '''
    if num_runs < 1:
        raise ValueError('num_runs must be at least 1')

    output_times_openCL = np.zeros(num_runs)
    output_times_serial = np.zeros(num_runs)

    # The OpenCL setup is identical for every run and lies outside the timed
    # region, so it is done once instead of once per iteration.
    platforms = cl.get_platforms()

    # Create a context with all the devices of the first platform
    devices = platforms[0].get_devices()
    context = cl.Context(devices)

    # Create a queue for transferring data and launching computations.
    # Turn on profiling to allow us to check event times.
    queue = cl.CommandQueue(context, context.devices[0],
                            properties=cl.command_queue_properties.PROFILING_ENABLE)

    # Read the kernel source with a context manager so the file gets closed.
    with open('harris_corner_detection_final.cl') as kernel_file:
        kernel_source = kernel_file.read()
    program = cl.Program(context, kernel_source).build(options='')

    read_write = cl.mem_flags.READ_WRITE

    # Loop to get the average of the run times
    for i in range(num_runs):

        # Load in image to be analyzed (kept inside the loop so every run
        # starts from a freshly loaded array)
        host_image = np.array(Image.open(str(image_file)).convert('L')).astype(np.float32)[::1, ::1].copy()

        # start time after image load for consistency
        start = time.time()

        sigma = 1 # standard deviation for the gaussian
        # 1D first derivative gaussian kernel. 'order' belongs to
        # generate_weights (as in the zero-order call below), not to np.asarray.
        filter_kernel_derivative = np.asarray(generate_weights(sigma, order = 1)).astype(np.float32)
        # 1D zero derivative gaussian kernel
        filter_kernel_zero = np.asarray(generate_weights(sigma, order = 0)).astype(np.float32)
        # Length of the entire weight vector (original note: 9 with sigma = 1)
        weight_length = len(filter_kernel_derivative)
        # Number of neighbors for each analyzed pixel (original note: 8)
        window = (weight_length - 1)
        # the halo is the number of neighbors on each side of the analyzed pixel
        halo = np.int32(window / 2.)

        # Every image-sized device buffer holds one 4-byte value per pixel
        image_bytes = host_image.size * 4

        gpu_image_in = cl.Buffer(context, read_write, image_bytes)
        zero_derivative_out = cl.Buffer(context, read_write, image_bytes)
        first_derivative_out = cl.Buffer(context, read_write, image_bytes)
        derivative_kernel_x = cl.Buffer(context, read_write, filter_kernel_derivative.size * 4)
        zero_kernel = cl.Buffer(context, read_write, filter_kernel_zero.size * 4)

        # Host array that receives the final Harris response
        Harris_Matrix = np.zeros_like(host_image)

        local_size = (int(halo), int(halo)) # 2D local_size
        # global size: image (width, height) rounded up via round_up
        global_size = tuple([round_up(g, l) for g, l in zip(host_image.shape[::-1], local_size)])

        width = np.int32(host_image.shape[1])
        height = np.int32(host_image.shape[0])

        local_memory = cl.LocalMemory(4 * ((local_size[0] + (halo * 2)) * (local_size[1] + 1)))
        local_buffer_zero_1 = cl.LocalMemory(4 * (np.shape(filter_kernel_zero)[0] + 1))
        local_buffer_first_1 = cl.LocalMemory(4 * (np.shape(filter_kernel_derivative)[0] + 1))

        buf_width = np.int32(local_size[0] + window)
        buf_height = np.int32(local_size[1] + window)

        # Host -> device uploads; the kernels are enqueued behind them on the same queue
        cl.enqueue_copy(queue, gpu_image_in, host_image, is_blocking=False)
        cl.enqueue_copy(queue, derivative_kernel_x, filter_kernel_derivative, is_blocking=False)
        cl.enqueue_copy(queue, zero_kernel, filter_kernel_zero, is_blocking=False)

        ########################################### First Kernel ##################################
        #                  First derivative of a gaussian of the image in the                     #
        #                  y-direction (axis = 0) and zero derivative of a gaussian               #
        #                  in the y-direction (axis = 0)                                          #
        ########################################### First Kernel ##################################

        program.gaussian_first_axis(
                            queue, global_size, local_size,
                            gpu_image_in,
                            zero_derivative_out,
                            first_derivative_out,
                            local_memory, width,
                            height, buf_width, buf_height, halo,
                            derivative_kernel_x, zero_kernel,
                            local_buffer_first_1, local_buffer_zero_1
                            )

        ########################################### Second Kernel ##################################
        #                  First derivative of a gaussian of the image in the                      #
        #                  x-direction (axis = 1) and zero derivative of a gaussian                #
        #                  in the x-direction (axis = 1)                                           #
        ########################################### Second Kernel ##################################

        # local memory buffers for the two filters used in the second kernel
        local_memory_axis2_1 = cl.LocalMemory(4 * ((local_size[0]) * (local_size[1] + (halo * 2) + 1)))
        local_memory_axis2_2 = cl.LocalMemory(4 * ((local_size[0]) * (local_size[1] + (halo * 2) + 1)))
        local_buffer_zero_2 = cl.LocalMemory(4 * (np.shape(filter_kernel_zero)[0] + 1))
        local_buffer_first_2 = cl.LocalMemory(4 * (np.shape(filter_kernel_derivative)[0] + 1))
        # memory for the output of the second kernel
        gpu_image_Wxx_derivative_out = cl.Buffer(context, read_write, image_bytes)
        gpu_image_Wyy_derivative_out = cl.Buffer(context, read_write, image_bytes)
        gpu_image_Wxy_derivative_out = cl.Buffer(context, read_write, image_bytes)

        program.gaussian_second_axis(
                            queue, global_size, local_size,
                            zero_derivative_out,
                            first_derivative_out,
                            gpu_image_Wxx_derivative_out,
                            gpu_image_Wyy_derivative_out,
                            gpu_image_Wxy_derivative_out,
                            local_memory_axis2_1, local_memory_axis2_2,
                            width,
                            height, buf_width, buf_height, halo,
                            derivative_kernel_x, zero_kernel,
                            local_buffer_first_2, local_buffer_zero_2
                            )

        ########################################### Third Kernel ###################################
        #                  Applies a gaussian to the product of the partial                        #
        #                  derivatives in the y-direction (axis = 0)                               #
        ########################################### Third Kernel ###################################

        # local memory buffers for all the components of the harris matrix
        local_memory_filter_Wxx = cl.LocalMemory(4 * ((local_size[0] + (halo * 2)) * (local_size[1] + 1)))
        local_memory_filter_Wyy = cl.LocalMemory(4 * ((local_size[0] + (halo * 2)) * (local_size[1] + 1)))
        local_memory_filter_Wxy = cl.LocalMemory(4 * ((local_size[0] + (halo * 2)) * (local_size[1] + 1)))
        local_buffer_zero_3 = cl.LocalMemory(4 * (np.shape(filter_kernel_zero)[0] + 1))

        # memory for the output of the third kernel
        gpu_image_Wxx_third_out = cl.Buffer(context, read_write, image_bytes)
        gpu_image_Wyy_third_out = cl.Buffer(context, read_write, image_bytes)
        gpu_image_Wxy_third_out = cl.Buffer(context, read_write, image_bytes)

        program.filter_first_axis_second_pass(
                            queue, global_size, local_size,
                            gpu_image_Wxx_derivative_out,
                            gpu_image_Wyy_derivative_out,
                            gpu_image_Wxy_derivative_out,
                            gpu_image_Wxx_third_out,
                            gpu_image_Wyy_third_out,
                            gpu_image_Wxy_third_out,
                            local_memory_filter_Wxx,
                            local_memory_filter_Wyy,
                            local_memory_filter_Wxy,
                            halo, width, height, buf_width, buf_height,
                            zero_kernel, local_buffer_zero_3
                            )

        ########################################### Fourth Kernel ##################################
        #                  Applies a gaussian to the product of the partial                        #
        #                  derivatives in the x-direction (axis = 1) and computes                  #
        #                  the final Harris Matrix for the output                                  #
        ########################################### Fourth Kernel ##################################

        # NOTE(review): these sizes put the "+ 1" outside the product and the
        # weight buffer has no "+ 1", unlike the second kernel's allocations.
        # Kept exactly as written; confirm against the .cl source.
        local_memory_filter_Wxx_2 = cl.LocalMemory(4 * ((local_size[0]) * (local_size[1] + (halo * 2)) + 1))
        local_memory_filter_Wyy_2 = cl.LocalMemory(4 * ((local_size[0]) * (local_size[1] + (halo * 2)) + 1))
        local_memory_filter_Wxy_2 = cl.LocalMemory(4 * ((local_size[0]) * (local_size[1] + (halo * 2)) + 1))
        local_buffer_zero_4 = cl.LocalMemory(4 * (np.shape(filter_kernel_zero)[0]))

        # memory to store output from fourth kernel, this is the final Harris Matrix
        gpu_image_filter_out = cl.Buffer(context, read_write, image_bytes)

        program.filter_second_axis_second_pass(
                            queue, global_size, local_size,
                            gpu_image_Wxx_third_out,
                            gpu_image_Wyy_third_out,
                            gpu_image_Wxy_third_out,
                            gpu_image_filter_out,
                            local_memory_filter_Wxx_2,
                            local_memory_filter_Wyy_2,
                            local_memory_filter_Wxy_2,
                            halo, width, height, buf_width, buf_height,
                            zero_kernel, local_buffer_zero_4
                            )

        # Output the final Harris Matrix to the CPU. The copy must be blocking:
        # Harris_Matrix is read on the host in the very next statement.
        cl.enqueue_copy(queue, Harris_Matrix, gpu_image_filter_out, is_blocking=True)
        points = get_harris_points(Harris_Matrix)
        end = time.time()

        # Store the time to run the entire openCL version
        output_times_openCL[i] = end - start

        # Store the time to run the entire serial version (result itself is not needed)
        with Timer() as serial_time:
            run_harris(host_image)
        output_times_serial[i] = serial_time.interval

    ########################################################################
    # test comparison for accuracy vs. harris.py Serial implementation by  #
    # "Programming Computer Vision with Python"  by Jan Erik Solem         #
    ########################################################################

    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    print('-------------Check Plots: Saved to the Directory--------------------------')
    plot_harris_points(host_image, points, im_name = 'Harris openCL Image')
    response = compute_harris_response(host_image, sigma=1)
    serial_points = get_harris_points(response, min_dist=10, threshold=0.1)
    plot_harris_points(host_image, serial_points, im_name = 'Harris Serial Image')
    print('--------------------------------------------------------------------------')

    print('-------------Check For Correctness----------------------------------------')
    # Random (row, column) position used to spot check both response matrices
    pt_x = np.random.randint(np.shape(host_image)[0])
    pt_y = np.random.randint(np.shape(host_image)[1])
    print('openCL Harris Matrix Random Point Check: %s' % (Harris_Matrix[pt_x, pt_y],))
    print('Serial Baseline Harris Matrix Random Point Check: %s' % (response[pt_x, pt_y],))
    print('Number of openCL points: %s' % (np.shape(points),))
    print('Number of Serial Points: %s' % (np.shape(serial_points),))
    # np.array_equal also copes with point lists of different lengths
    print('Are the two lists of corner points the same? %s' % (np.array_equal(serial_points, points),))
    print('--------------------------------------------------------------------------')

    print('-------------Check Timing Comparision-------------------------------------')
    mean_openCL = output_times_openCL.mean()
    mean_serial = output_times_serial.mean()
    print('Time to run openCL %s' % (mean_openCL,))
    print('Time to run Serial %s' % (mean_serial,))
    print('--------------------------------------------------------------------------')

    return Harris_Matrix, serial_points, points, mean_openCL, mean_serial