def sort(self, num, keys, values, batch=1, direction=1): print "bitonic sort" #num must be a power of 2 and <= max_num log2l, remainder = self.factorRadix2(num) if remainder != 1: return #self.keys = keys #self.values = values self.keys = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=keys) self.values = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=values) cl.enqueue_read_buffer(self.queue, self.keys, keys) cl.enqueue_read_buffer(self.queue, self.values, values) self.queue.finish() direction = (direction != 0) array_length = keys.size print "array_length", array_length if array_length < self.local_size_limit: self.local(array_length, direction) else: self.local1(batch, array_length, direction) size = 2 * self.local_size_limit while size <= array_length: stride = size / 2 while stride > 0: print "size, stride", size, stride if stride >= self.local_size_limit: self.merge_global(batch, array_length, stride, size, direction) else: self.merge_local(batch, array_length, size, stride, direction) break stride >>= 1 size <<= 1 self.queue.finish() #need to copy back cl.enqueue_copy_buffer(self.queue, self.d_tempKeys, self.keys).wait() cl.enqueue_copy_buffer(self.queue, self.d_tempValues, self.values).wait() self.queue.finish() #copy to cpu to view results cl.enqueue_read_buffer(self.queue, self.keys, keys) cl.enqueue_read_buffer(self.queue, self.values, values) self.queue.finish() #cl.enqueue_read_buffer(self.queue, self.d_tempKeys, keys).wait() #cl.enqueue_read_buffer(self.queue, self.d_tempValues, values).wait() return keys, values
def copyBuffer(self, buf, dest=None):
    """Copy ``buf`` into ``dest``, allocating a new buffer when ``dest`` is None.

    Returns the freshly allocated copy when no destination was supplied;
    otherwise copies into ``dest`` and returns None.
    """
    target = dest if dest is not None else self.allocate(buf.shape, buf.dtype)
    cl.enqueue_copy_buffer(self.queue, buf, target)
    if dest is None:
        return target
def sort(self, num, keys, values, batch=1, direction=1): print "bitonic sort" # num must be a power of 2 and <= max_num log2l, remainder = self.factorRadix2(num) if remainder != 1: return # self.keys = keys # self.values = values self.keys = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=keys) self.values = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=values) cl.enqueue_read_buffer(self.queue, self.keys, keys) cl.enqueue_read_buffer(self.queue, self.values, values) self.queue.finish() direction = direction != 0 array_length = keys.size print "array_length", array_length if array_length < self.local_size_limit: self.local(array_length, direction) else: self.local1(batch, array_length, direction) size = 2 * self.local_size_limit while size <= array_length: stride = size / 2 while stride > 0: print "size, stride", size, stride if stride >= self.local_size_limit: self.merge_global(batch, array_length, stride, size, direction) else: self.merge_local(batch, array_length, size, stride, direction) break stride >>= 1 size <<= 1 self.queue.finish() # need to copy back cl.enqueue_copy_buffer(self.queue, self.d_tempKeys, self.keys).wait() cl.enqueue_copy_buffer(self.queue, self.d_tempValues, self.values).wait() self.queue.finish() # copy to cpu to view results cl.enqueue_read_buffer(self.queue, self.keys, keys) cl.enqueue_read_buffer(self.queue, self.values, values) self.queue.finish() # cl.enqueue_read_buffer(self.queue, self.d_tempKeys, keys).wait() # cl.enqueue_read_buffer(self.queue, self.d_tempValues, values).wait() return keys, values
def adjust_weights( self, context ):
    """
    Adjust weights of neural network by certain direction.
    """
    opencl = context.opencl
    # Launch the quickprop weight-update kernel over the whole weights buffer.
    opencl.kernel_adjust_weights_quickprop(
        opencl.queue, ( int( context._weights_buf_size ), ),
        context._gradient_buf,
        self.prev_direction_buf,
        self.n,
        self.alpha,
        self._weights_delta_buf,
        context._weights_buf
        )
    # Remember the current gradient as the previous direction for the next step.
    pyopencl.enqueue_copy_buffer( opencl.queue, context._gradient_buf, self.prev_direction_buf )
def test_copy_buffer(ctx_factory):
    """Round-trip a random array through a device-to-device buffer copy."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)
    mf = cl.mem_flags

    src_host = np.random.rand(50000).astype(np.float32)
    dst_host = np.empty_like(src_host)

    src_buf = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=src_host)
    dst_buf = cl.Buffer(context, mf.WRITE_ONLY, dst_host.nbytes)

    # Copy on the device, read back, and verify the copy is bit-exact.
    cl.enqueue_copy_buffer(queue, src_buf, dst_buf).wait()
    cl.enqueue_read_buffer(queue, dst_buf, dst_host).wait()

    assert la.norm(src_host - dst_host) == 0
def adjust_weights( self, context ):
    """
    Adjust weights of neural network by certain direction.
    """
    queue = context.opencl.queue
    # Launch the RPROP weight-update kernel over the whole weights buffer.
    context.opencl.kernel_adjust_weights_rprop(
        queue, ( int( context._weights_buf_size ), ),
        context._gradient_buf,
        self.prev_gradient_buf,
        self.n_buf,
        context._weights_buf
        )
    # Keep the current gradient as the 'previous gradient' for the next call.
    pyopencl.enqueue_copy_buffer( queue, context._gradient_buf, self.prev_gradient_buf )
def copyBuffer(self, cl_queue, cl_buffer):
    """Copy the contents of ``cl_buffer`` into this (already allocated) buffer."""
    # Both source and destination must still own their device memory.
    if not self.holds_data:
        raise RuntimeError('The buffer has been freed before copyBuffer is called')
    if not cl_buffer.holds_data:
        raise RuntimeError('The provided cl_buffer is either not allocated, or has been freed before copyBuffer is called')

    # The source must match our halo-padded dimensions and float width.
    assert cl_buffer.nx_halo == self.nx_halo, str(cl_buffer.nx_halo) + " vs " + str(self.nx_halo)
    assert cl_buffer.ny_halo == self.ny_halo, str(cl_buffer.ny_halo) + " vs " + str(self.ny_halo)
    assert cl_buffer.bytes_per_float == self.bytes_per_float, "Provided cl_buffer itemsize is " + str(cl_buffer.bytes_per_float) + ", but should have been " + str(self.bytes_per_float)

    # Okay, everything is fine - issue device-to-device-copy:
    num_bytes = self.bytes_per_float * self.nx_halo * self.ny_halo
    pyopencl.enqueue_copy_buffer(cl_queue, cl_buffer.data, self.data, num_bytes)
def setStartingCoordinates(self, dev_initialMembraneCoordinatesX, dev_initialMembraneCoordinatesY,
                           dev_initialMembranNormalVectorsX, dev_initialMembranNormalVectorsY):
    """Seed the working coordinate and normal-vector buffers from the given
    initial device arrays (device-to-device copies)."""
    copies = (
        (dev_initialMembraneCoordinatesX, self.dev_membraneCoordinatesX),
        (dev_initialMembraneCoordinatesY, self.dev_membraneCoordinatesY),
        (dev_initialMembranNormalVectorsX, self.dev_membraneNormalVectorsX),
        (dev_initialMembranNormalVectorsY, self.dev_membraneNormalVectorsY),
    )
    for src, dst in copies:
        cl.enqueue_copy_buffer(self.queue, src.data, dst.data).wait()
    barrierEvent = cl.enqueue_barrier(self.queue)
    self.queue.finish()
def setStartingMembraneNormals(self, dev_initialMembranNormalVectorsX, dev_initialMembranNormalVectorsY):
    """Initialise the working membrane-normal buffers for the next image."""
    if self.resetNormalsAfterEachImage and not self.getContourId() == 0:
        # Reset contour normals to the radial vectors.  Skipped for image 0,
        # since doing it there would destroy the correspondence between the
        # contour-coordinate indexes and their contour normals.
        srcX, srcY = self.dev_radialVectorsX, self.dev_radialVectorsY
    else:
        # Carry the previous image's normals over as the starting normals.
        srcX, srcY = dev_initialMembranNormalVectorsX, dev_initialMembranNormalVectorsY
    cl.enqueue_copy_buffer(self.queue, srcX.data, self.dev_membraneNormalVectorsX.data).wait()
    cl.enqueue_copy_buffer(self.queue, srcY.data, self.dev_membraneNormalVectorsY.data).wait()
    barrierEvent = cl.enqueue_barrier(self.queue)
def process(self):
    """
    Process signal by this layer. Invokes OpenCL program that produces
    output array in background.
    """
    # All predecessor layers must have produced their outputs first.
    for prev in self._prev_layers:
        if not prev[0].processed:
            return

    outbuf = self.context._outputs_buf
    inbuf = self.context._inputs_buf
    queue = self.opencl.queue

    # Gather each predecessor's output slice into our input region
    # (offsets and byte counts are in 4-byte elements).
    input_pos = 0
    for prev in self._prev_layers:
        copy_evt = pyopencl.enqueue_copy_buffer(
            queue, outbuf, inbuf,
            byte_count=int(prev[2] * 4),
            src_offset=int((prev[0]._neurons_offset + prev[1]) * 4),
            dst_offset=int((self._inputs_offset + input_pos) * 4),
            wait_for=(prev[0]._process_event, ))
        self._process_wait_for.append(copy_evt)
        input_pos += prev[2]

    # Configure and launch this layer's processing kernel once the copies
    # have completed.
    kernel = self.opencl.kernel_process_layer
    kernel.set_arg(2, self._inputs_offset)
    kernel.set_arg(3, self._weights_offset)
    kernel.set_arg(4, self._neurons_offset)
    kernel.set_arg(5, self._inputs_per_neuron)
    kernel.set_arg(6, self._neuron_count)
    self._process_event = pyopencl.enqueue_nd_range_kernel(
        queue, kernel, (int(self._neuron_count * 64), ), (64, ),
        wait_for=self._process_wait_for)
    del self._process_wait_for[:]

    self._processed = True
    # Propagate processing to successor layers.
    for nxt in self._next_layers:
        nxt[0].process()
def setStartingCoordinatesNew(self, dev_initialMembraneCoordinatesX, dev_initialMembraneCoordinatesY):
    """Set the starting contour coordinates for the next tracking pass."""
    pairs = (
        (dev_initialMembraneCoordinatesX, self.dev_membraneCoordinatesX),
        (dev_initialMembraneCoordinatesY, self.dev_membraneCoordinatesY),
        # The same coordinates also seed the 'previous interpolated' buffers.
        (dev_initialMembraneCoordinatesX, self.dev_previousInterpolatedMembraneCoordinatesX),
        (dev_initialMembraneCoordinatesY, self.dev_previousInterpolatedMembraneCoordinatesY),
    )
    for src, dst in pairs:
        cl.enqueue_copy_buffer(self.queue, src.data, dst.data).wait()
    barrierEvent = cl.enqueue_barrier(self.queue)
def process( self ):
    """
    Process signal by this layer. Invokes OpenCL program that produces
    output array in background.
    """
    # Every predecessor must have produced its output before we can run.
    if any( not link[0].processed for link in self._prev_layers ):
        return

    outbuf = self.context._outputs_buf
    inbuf = self.context._inputs_buf
    queue = self.opencl.queue

    # Copy each predecessor's output slice into our input region
    # (offsets and byte counts are in 4-byte elements).
    offset = 0
    for link in self._prev_layers:
        self._process_wait_for.append( pyopencl.enqueue_copy_buffer(
            queue, outbuf, inbuf,
            byte_count = int( link[2] * 4 ),
            src_offset = int( ( link[0]._neurons_offset + link[1] ) * 4 ),
            dst_offset = int( ( self._inputs_offset + offset ) * 4 ),
            wait_for = ( link[0]._process_event, ) ) )
        offset += link[2]

    # Configure and launch this layer's processing kernel.
    kernel = self.opencl.kernel_process_layer
    kernel.set_arg( 2, self._inputs_offset )
    kernel.set_arg( 3, self._weights_offset )
    kernel.set_arg( 4, self._neurons_offset )
    kernel.set_arg( 5, self._inputs_per_neuron )
    kernel.set_arg( 6, self._neuron_count )
    self._process_event = pyopencl.enqueue_nd_range_kernel(
        queue, kernel, ( int( self._neuron_count * 64 ), ), ( 64, ),
        wait_for = self._process_wait_for )
    del self._process_wait_for[:]

    self._processed = True
    # Cascade processing down to the successor layers.
    for link in self._next_layers:
        link[0].process()
def eval(self, calcs):
    """ Evaluate each calc and store in the k list if necessary """
    ncalcs = len(calcs)
    particles = self.particles
    # 'k1', 'k2', ... -- storage key for the current step number.
    k_num = 'k' + str(self.cstep)
    for i in range(ncalcs):
        calc = calcs[i]
        queue = calc.queue
        updates = calc.updates
        nupdates = calc.nupdates

        # get the destination particle array for this calc
        pa = self.arrays[calc.dnum]

        # Evaluate the calc. The result is stored in cl_tmpx, cl_tmpy, ...
        calc.sph()
        pa.read_from_buffer()

        for j in range(nupdates):
            update_prop = updates[j]
            step_prop = self.step_props[j]
            #step_array = pa.get(step_prop)
            step_prop_buffer = pa.get_cl_buffer(step_prop)

            if not calc.integrates:
                # Non-integrating calc: copy the result directly into the
                # updated property's device buffer.
                # NOTE(review): keyword 'dest=' -- pyopencl's
                # enqueue_copy_buffer names this parameter 'dst'; confirm
                # against the installed pyopencl version.
                update_prop_buffer = pa.get_cl_buffer(update_prop)
                cl.enqueue_copy_buffer(queue, src=step_prop_buffer,
                                       dest=update_prop_buffer).wait()

                # ensure that all processes have reached this point
                particles.barrier()

                # update neighbor information if 'h' has been updated
                if calc.tag == "h":
                    particles.update()

                # update the remote particle properties
                self.rupdate_list[calc.dnum] = [update_prop]
                particles.update_remote_particle_properties(
                    self.rupdate_list)
            else:
                # Integrating calc: stash the result in this step's k-buffer.
                k_prop = self.k_props[calc.id][k_num][j]
                k_prop_buffer = pa.get_cl_buffer(k_prop)
                cl.enqueue_copy(queue, src=step_prop_buffer,
                                dest=k_prop_buffer, ).wait()
            pass

    #ensure that the eval phase is completed for all processes
    particles.barrier()
def eval(self, calcs):
    """ Evaluate each calc and store in the k list if necessary """
    ncalcs = len(calcs)
    particles = self.particles
    # 'k1', 'k2', ... -- storage key for the current step number.
    k_num = 'k' + str(self.cstep)
    for i in range(ncalcs):
        calc = calcs[i]
        queue = calc.queue
        updates = calc.updates
        nupdates = calc.nupdates

        # get the destination particle array for this calc
        pa = self.arrays[calc.dnum]

        # Evaluate the calc. The result is stored in cl_tmpx, cl_tmpy, ...
        calc.sph()
        pa.read_from_buffer()

        for j in range(nupdates):
            update_prop = updates[j]
            step_prop = self.step_props[j]
            #step_array = pa.get(step_prop)
            step_prop_buffer = pa.get_cl_buffer(step_prop)

            if not calc.integrates:
                # Non-integrating calc: copy the result directly into the
                # updated property's device buffer.
                # NOTE(review): keyword 'dest=' -- pyopencl's
                # enqueue_copy_buffer names this parameter 'dst'; confirm
                # against the installed pyopencl version.
                update_prop_buffer = pa.get_cl_buffer(update_prop)
                cl.enqueue_copy_buffer(queue, src=step_prop_buffer,
                                       dest=update_prop_buffer).wait()

                # ensure that all processes have reached this point
                particles.barrier()

                # update neighbor information if 'h' has been updated
                if calc.tag == "h":
                    particles.update()

                # update the remote particle properties
                self.rupdate_list[calc.dnum] = [update_prop]
                particles.update_remote_particle_properties(
                    self.rupdate_list)
            else:
                # Integrating calc: stash the result in this step's k-buffer.
                k_prop = self.k_props[calc.id][k_num][j]
                k_prop_buffer = pa.get_cl_buffer(k_prop)
                cl.enqueue_copy(queue, src=step_prop_buffer,
                                dest=k_prop_buffer, ).wait()
            pass

    #ensure that the eval phase is completed for all processes
    particles.barrier()
# Main loop: time device-to-device memcpy for a range of transfer sizes.
print('main loop')
from datetime import datetime
mf = cl.mem_flags
for i, nx in enumerate(seed_nxs):
    in_host = in_rand[:nx]
    out_host = np.zeros_like(in_host)
    in_gpu = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=in_host)
    # Allocate the destination by size only.  (The previous version passed
    # hostbuf= without a *_HOST_PTR flag, which is invalid in OpenCL.)
    out_gpu = cl.Buffer(context, mf.WRITE_ONLY, in_host.nbytes)
    queue.finish()

    t0 = datetime.now()
    for j in xrange(memcpy_iterations):
        #prg.copy_gpu(queue, a.shape, None, np.int32(nx), in_gpu, out_gpu)
        cl.enqueue_copy_buffer(queue, in_gpu, out_gpu)
    queue.finish()
    dt = datetime.now() - t0
    # Average per-transfer time; /2 keeps the original normalisation.
    elapsed_times[i] = (dt.seconds + dt.microseconds * 1e-6) / 2 / memcpy_iterations

    # Read back and verify the copy was bit-exact.
    cl.enqueue_read_buffer(queue, out_gpu, out_host)
    assert np.linalg.norm(in_host - out_host) == 0

    del in_host, out_host
    in_gpu.release()
    out_gpu.release()
    #print('%d/%d (%d %%)\r' % (i, num_samples, float(i)/num_samples*100)),
    #sys.stdout.flush()
    print('%d/%d (%d %%) dt = %g sec, nx = %d (%d Mbytes)' % (i, num_samples, float(i)/num_samples*100, elapsed_times[i], nx, nx*4/(1024**2)))
def reloadData(self):
    """Reset both history position buffers from the current GL position buffer."""
    import pyopencl as cl
    cl.enqueue_acquire_gl_objects(self.queue, self.gl_objects)
    # Both pos_n1 and pos_n2 restart from the current positions.
    for dst in (self.pos_n1_cl, self.pos_n2_cl):
        cl.enqueue_copy_buffer(self.queue, self.pos_cl, dst).wait()
    cl.enqueue_release_gl_objects(self.queue, self.gl_objects)
def trackContour(self):
    """Run one tracking iteration of the contour on the GPU.

    Launches the membrane-detection kernel for every stride, filters the
    detected coordinates, converts them to polar coordinates, interpolates,
    and finally checks convergence.  Results and the tracking/iteration
    status flags are left in the corresponding device buffers / host arrays.
    """
    if self.resetNormalsAfterEachImage and not self.getContourId(
    ) == 0 and self.nrOfTrackingIterations == 0:
        # reset contour normal vector to radial vectors; we do this only
        # starting for the second image, since doing this for image 0 would
        # destroy the correspondence of the indexes of the contour
        # coordinates to their corresponding contour normals
        cl.enqueue_copy_buffer(
            self.queue, self.dev_radialVectorsX.data,
            self.dev_membraneNormalVectorsX.data).wait()
        cl.enqueue_copy_buffer(
            self.queue, self.dev_radialVectorsY.data,
            self.dev_membraneNormalVectorsY.data).wait()

    # tracking status variables
    self.nrOfTrackingIterations = self.nrOfTrackingIterations + 1
    stopInd = 1
    self.trackingFinished = np.array(1, dtype=np.int32)  # True
    self.dev_trackingFinished = cl_array.to_device(self.queue,
                                                   self.trackingFinished)
    self.iterationFinished = np.array(0, dtype=np.int32)  # False
    self.dev_iterationFinished = cl_array.to_device(
        self.queue, self.iterationFinished)

    # Pack the separate X/Y (and theta/radius) arrays into combined
    # double-vector device buffers (helpers.ToDoubleVectorOnDevice), which
    # is the layout the kernels below take.
    self.dev_membraneCoordinates = helpers.ToDoubleVectorOnDevice(
        self.queue, self.dev_membraneCoordinatesX,
        self.dev_membraneCoordinatesY)
    self.dev_membraneNormalVectors = helpers.ToDoubleVectorOnDevice(
        self.queue, self.dev_membraneNormalVectorsX,
        self.dev_membraneNormalVectorsY)
    self.dev_previousInterpolatedMembraneCoordinates = helpers.ToDoubleVectorOnDevice(
        self.queue, self.dev_previousInterpolatedMembraneCoordinatesX,
        self.dev_previousInterpolatedMembraneCoordinatesY)
    self.dev_membranePolarCoordinates = helpers.ToDoubleVectorOnDevice(
        self.queue, self.dev_membranePolarTheta,
        self.dev_membranePolarRadius)
    self.dev_interpolatedMembraneCoordinates = helpers.ToDoubleVectorOnDevice(
        self.queue, self.dev_interpolatedMembraneCoordinatesX,
        self.dev_interpolatedMembraneCoordinatesY)

    # NOTE(review): the loop extent below was reconstructed from whitespace-
    # mangled source -- confirm that only findMembranePosition (plus its
    # barrier) belongs inside the stride loop.
    for strideNr in range(self.nrOfStrides):
        # set the starting index of the coordinate array for each kernel instance
        kernelCoordinateStartingIndex = np.int32(
            strideNr * self.detectionKernelStrideSize)
        self.prg.findMembranePosition(
            self.queue, self.trackingGlobalSize, self.trackingWorkGroupSize,
            self.sampler,
            self.dev_Img, self.imgSizeX, self.imgSizeY,
            self.buf_localRotationMatrices,
            self.buf_linFitSearchRangeXvalues,
            self.linFitParameter,
            cl.LocalMemory(self.fitIntercept_memSize),
            cl.LocalMemory(self.fitIncline_memSize),
            cl.LocalMemory(self.rotatedUnitVector_memSize),
            self.meanParameter,
            self.buf_meanRangeXvalues, self.meanRangePositionOffset,
            cl.LocalMemory(self.localMembranePositions_memSize),
            self.dev_membraneCoordinates.data,
            self.dev_membraneNormalVectors.data,
            self.dev_fitInclines.data,
            kernelCoordinateStartingIndex,
            self.inclineTolerance,
            self.inclineRefinementRange)
        barrierEvent = cl.enqueue_barrier(self.queue)

    # Replace NaN detections using the nearest valid neighbours.
    self.prg.filterNanValues(self.queue, self.gradientGlobalSize, None,
        self.dev_membraneCoordinates.data,
        self.dev_membraneNormalVectors.data,
        cl.LocalMemory(self.dev_closestLowerNoneNanIndex.nbytes),
        cl.LocalMemory(self.dev_closestUpperNoneNanIndex.nbytes)
        )
    barrierEvent = cl.enqueue_barrier(self.queue)

    # Filter coordinates that shifted more than maxCoordinateShift away
    # from the previous interpolated contour.
    self.prg.filterJumpedCoordinates(self.queue, self.gradientGlobalSize, None,
        self.dev_previousContourCenter.data,
        self.dev_membraneCoordinates.data,
        self.dev_membraneNormalVectors.data,
        self.dev_previousInterpolatedMembraneCoordinates.data,
        cl.LocalMemory(self.dev_closestLowerNoneNanIndex.nbytes),
        cl.LocalMemory(self.dev_closestUpperNoneNanIndex.nbytes),
        cl.LocalMemory(self.listOfGoodCoordinates_memSize),
        self.maxCoordinateShift
        )
    barrierEvent = cl.enqueue_barrier(self.queue)

    self.prg.calculateInterCoordinateAngles(self.queue, self.gradientGlobalSize, None,
        self.dev_interCoordinateAngles.data,
        self.dev_membraneCoordinates.data
        )
    barrierEvent = cl.enqueue_barrier(self.queue)

    # Filter coordinates whose inter-coordinate angle exceeds the maximum.
    self.prg.filterIncorrectCoordinates(self.queue, self.gradientGlobalSize, None,
        self.dev_previousContourCenter.data,
        self.dev_interCoordinateAngles.data,
        self.dev_membraneCoordinates.data,
        self.dev_membraneNormalVectors.data,
        cl.LocalMemory(self.dev_closestLowerNoneNanIndex.nbytes),
        cl.LocalMemory(self.dev_closestUpperNoneNanIndex.nbytes),
        self.maxInterCoordinateAngle
        )
    barrierEvent = cl.enqueue_barrier(self.queue)
    # information regarding barriers:
    # http://stackoverflow.com/questions/13200276/what-is-the-difference-between-clenqueuebarrier-and-clfinish

    ########################################################################
    ### Calculate contour center
    ########################################################################
    self.calculateContourCenter()

    ########################################################################
    ### Convert cartesian coordinates to polar coordinates
    ########################################################################
    self.prg.cart2pol(self.queue, self.gradientGlobalSize, None,
        self.dev_membraneCoordinates.data,
        self.dev_membranePolarCoordinates.data,
        self.dev_contourCenter.data)
    barrierEvent = cl.enqueue_barrier(self.queue)

    ########################################################################
    ### Interpolate polar coordinates
    ########################################################################
    self.prg.sortCoordinates(self.queue, (1, 1), None,
        self.dev_membranePolarCoordinates.data,
        self.dev_membraneCoordinates.data,
        self.dev_membraneNormalVectors.data,
        np.int32(self.nrOfDetectionAngleSteps)
        )
    barrierEvent = cl.enqueue_barrier(self.queue)

    self.prg.interpolatePolarCoordinatesLinear(self.queue, self.gradientGlobalSize, None,
        self.dev_membranePolarCoordinates.data,
        self.dev_radialVectors.data,
        self.dev_contourCenter.data,
        self.dev_membraneCoordinates.data,
        self.dev_interpolatedMembraneCoordinates.data,
        self.dev_interpolationAngles.data,
        self.nrOfAnglesToCompare
        )
    barrierEvent = cl.enqueue_barrier(self.queue)

    ########################################################################
    ### Convert polar coordinates to cartesian coordinates
    ########################################################################
    # Convergence checks: the kernels compare the new contour/center against
    # the previous iteration within the given tolerances and update the
    # dev_trackingFinished flag (kernel source not visible here -- presumed).
    self.prg.checkIfTrackingFinished(self.queue, self.gradientGlobalSize, None,
        self.dev_interpolatedMembraneCoordinates.data,
        self.dev_previousInterpolatedMembraneCoordinates.data,
        self.dev_trackingFinished.data,
        self.coordinateTolerance)
    barrierEvent = cl.enqueue_barrier(self.queue)

    self.prg.checkIfCenterConverged(self.queue, (1, 1), None,
        self.dev_contourCenter.data,
        self.dev_previousContourCenter.data,
        self.dev_trackingFinished.data,
        self.centerTolerance)
    barrierEvent = cl.enqueue_barrier(self.queue)

    # Unpack the combined double-vector buffers back into separate X/Y arrays.
    self.dev_membraneNormalVectorsX, self.dev_membraneNormalVectorsY = helpers.ToSingleVectorsOnDevice(
        self.queue, self.dev_membraneNormalVectors)
    self.dev_previousInterpolatedMembraneCoordinatesX, self.dev_previousInterpolatedMembraneCoordinatesY = helpers.ToSingleVectorsOnDevice(
        self.queue, self.dev_previousInterpolatedMembraneCoordinates)
    self.dev_membraneCoordinatesX, self.dev_membraneCoordinatesY = helpers.ToSingleVectorsOnDevice(
        self.queue, self.dev_membraneCoordinates)
    self.dev_membranePolarTheta, self.dev_membranePolarRadius = helpers.ToSingleVectorsOnDevice(
        self.queue, self.dev_membranePolarCoordinates)
    self.dev_interpolatedMembraneCoordinatesX, self.dev_interpolatedMembraneCoordinatesY = helpers.ToSingleVectorsOnDevice(
        self.queue, self.dev_interpolatedMembraneCoordinates)

    cl.enqueue_read_buffer(self.queue, self.dev_trackingFinished.data,
                           self.trackingFinished).wait()
    barrierEvent = cl.enqueue_barrier(self.queue)

    # The freshly interpolated contour (and center) becomes the 'previous'
    # one for the next iteration's convergence check.
    cl.enqueue_copy_buffer(
        self.queue, self.dev_interpolatedMembraneCoordinatesX.data,
        self.dev_previousInterpolatedMembraneCoordinatesX.data).wait()
    cl.enqueue_copy_buffer(
        self.queue, self.dev_interpolatedMembraneCoordinatesY.data,
        self.dev_previousInterpolatedMembraneCoordinatesY.data).wait()
    cl.enqueue_copy_buffer(self.queue, self.dev_contourCenter.data,
                           self.dev_previousContourCenter.data).wait()

    self.prg.setIterationFinished(self.queue, (1, 1), None,
                                  self.dev_iterationFinished.data)
    barrierEvent = cl.enqueue_barrier(self.queue)
    cl.enqueue_read_buffer(self.queue, self.dev_iterationFinished.data,
                           self.iterationFinished).wait()

    self.setStartingCoordinatesNew(self.dev_interpolatedMembraneCoordinatesX,
                                   self.dev_interpolatedMembraneCoordinatesY)
    pass
def trackContourSequentially(self):
    """Track the contour one coordinate at a time (sequential variant).

    For each detection-angle step the detection kernel is run for a single
    coordinate index; the starting position of the next coordinate is then
    extrapolated on the host by rotating the current radius vector around
    the rotation center.  Afterwards the membrane normals and contour
    center are recomputed on the device.
    """
    ## tracking status variables
    #self.trackingFinished = np.int32(1) # True
    #self.iterationFinished = np.int32(1) # True

    for coordinateIndex in range(int(self.nrOfDetectionAngleSteps)):
        coordinateIndex = np.int32(coordinateIndex)
        # 2x2 rotation matrix advancing the radius vector by one angle step.
        angle = self.angleStepSize * np.float64(coordinateIndex + 1)
        radiusVectorRotationMatrix = np.array(
            [[np.cos(angle), -np.sin(angle)],
             [np.sin(angle), np.cos(angle)]])

        # Pack X/Y into the combined buffers the kernel takes.
        self.dev_membraneNormalVectors = helpers.ToDoubleVectorOnDevice(
            self.queue, self.dev_membraneNormalVectorsX,
            self.dev_membraneNormalVectorsY)
        self.dev_membraneCoordinates = helpers.ToDoubleVectorOnDevice(
            self.queue, self.dev_membraneCoordinatesX,
            self.dev_membraneCoordinatesY)

        # Detect the membrane position for this single coordinate index.
        self.prg.findMembranePosition(
            self.queue, self.global_size, self.local_size, self.sampler,
            self.dev_Img, self.imgSizeX, self.imgSizeY,
            self.buf_localRotationMatrices,
            self.buf_linFitSearchRangeXvalues,
            self.linFitParameter,
            cl.LocalMemory(self.fitIntercept_memSize),
            cl.LocalMemory(self.fitIncline_memSize),
            cl.LocalMemory(self.rotatedUnitVector_memSize),
            self.meanParameter,
            self.buf_meanRangeXvalues, self.meanRangePositionOffset,
            cl.LocalMemory(self.localMembranePositions_memSize),
            self.dev_membraneCoordinates.data,
            self.dev_membraneNormalVectors.data,
            self.dev_fitInclines.data,
            coordinateIndex,
            self.inclineTolerance,
            self.inclineRefinementRange)
        barrierEvent = cl.enqueue_barrier(self.queue)

        self.dev_membraneCoordinatesX, self.dev_membraneCoordinatesY = helpers.ToSingleVectorsOnDevice(
            self.queue, self.dev_membraneCoordinates)
        self.dev_membraneNormalVectorsX, self.dev_membraneNormalVectorsY = helpers.ToSingleVectorsOnDevice(
            self.queue, self.dev_membraneNormalVectors)

        # Read the detected coordinates and normals back to the host.
        cl.enqueue_read_buffer(self.queue,
                               self.dev_membraneCoordinatesX.data,
                               self.host_membraneCoordinatesX).wait()
        cl.enqueue_read_buffer(self.queue,
                               self.dev_membraneCoordinatesY.data,
                               self.host_membraneCoordinatesY).wait()
        cl.enqueue_read_buffer(self.queue,
                               self.dev_membraneNormalVectorsX.data,
                               self.host_membraneNormalVectorsX).wait()
        cl.enqueue_read_buffer(self.queue,
                               self.dev_membraneNormalVectorsY.data,
                               self.host_membraneNormalVectorsY).wait()

        # Extrapolate the NEXT coordinate's starting position by rotating the
        # current radius vector (preserving its length) around the center.
        currentMembraneCoordinate = np.array([
            self.host_membraneCoordinatesX[coordinateIndex],
            self.host_membraneCoordinatesY[coordinateIndex]
        ])
        radiusVector = currentMembraneCoordinate - self.rotationCenterCoordinate
        radiusVectorNorm = np.sqrt(radiusVector[0]**2 + radiusVector[1]**2)
        rotatedRadiusUnitVector = radiusVectorRotationMatrix.dot(
            self.radiusUnitVector)
        nextMembranePosition = self.rotationCenterCoordinate + rotatedRadiusUnitVector * radiusVectorNorm
        nextMembraneNormalVector = np.array([
            self.host_membraneNormalVectorsX[coordinateIndex],
            self.host_membraneNormalVectorsY[coordinateIndex]
        ])

        if coordinateIndex < self.host_membraneCoordinatesX.shape[0] - 1:
            self.host_membraneCoordinatesX[coordinateIndex + 1] = nextMembranePosition[0]
            self.host_membraneCoordinatesY[coordinateIndex + 1] = nextMembranePosition[1]
            self.host_membraneNormalVectorsX[
                coordinateIndex + 1] = nextMembraneNormalVector[0]
            self.host_membraneNormalVectorsY[
                coordinateIndex + 1] = nextMembraneNormalVector[1]

        # Push the updated host arrays back to the device for the next step.
        self.dev_membraneCoordinatesX = cl_array.to_device(
            self.queue, self.host_membraneCoordinatesX)
        self.dev_membraneCoordinatesY = cl_array.to_device(
            self.queue, self.host_membraneCoordinatesY)
        self.dev_membraneNormalVectorsX = cl_array.to_device(
            self.queue, self.host_membraneNormalVectorsX)
        self.dev_membraneNormalVectorsY = cl_array.to_device(
            self.queue, self.host_membraneNormalVectorsY)

    # calculate new normal vectors
    # NOTE(review): loop extent reconstructed from whitespace-mangled source;
    # the post-processing below is assumed to run once, after all coordinates.
    self.dev_membraneCoordinates = helpers.ToDoubleVectorOnDevice(
        self.queue, self.dev_membraneCoordinatesX,
        self.dev_membraneCoordinatesY)
    self.dev_membraneNormalVectors = helpers.ToDoubleVectorOnDevice(
        self.queue, self.dev_membraneNormalVectorsX,
        self.dev_membraneNormalVectorsY)
    self.prg.calculateMembraneNormalVectors(self.queue, self.gradientGlobalSize, None,
        self.dev_membraneCoordinates.data,
        self.dev_membraneNormalVectors.data
        )
    self.calculateContourCenter()
    self.dev_membraneCoordinatesX, self.dev_membraneCoordinatesY = helpers.ToSingleVectorsOnDevice(
        self.queue, self.dev_membraneCoordinates)
    self.dev_membraneNormalVectorsX, self.dev_membraneNormalVectorsY = helpers.ToSingleVectorsOnDevice(
        self.queue, self.dev_membraneNormalVectors)

    # The final coordinates serve as both the interpolated and the
    # previous-interpolated contour for subsequent processing.
    cl.enqueue_copy_buffer(
        self.queue, self.dev_membraneCoordinatesX.data,
        self.dev_interpolatedMembraneCoordinatesX.data).wait()
    cl.enqueue_copy_buffer(
        self.queue, self.dev_membraneCoordinatesY.data,
        self.dev_interpolatedMembraneCoordinatesY.data).wait()
    cl.enqueue_copy_buffer(
        self.queue, self.dev_membraneCoordinatesX.data,
        self.dev_previousInterpolatedMembraneCoordinatesX.data).wait()
    cl.enqueue_copy_buffer(
        self.queue, self.dev_membraneCoordinatesY.data,
        self.dev_previousInterpolatedMembraneCoordinatesY.data).wait()

    self.setStartingCoordinatesNew(self.dev_interpolatedMembraneCoordinatesX,
                                   self.dev_interpolatedMembraneCoordinatesY)
    self.queue.finish()
def setContourCenter(self, dev_initialContourCenter):
    """Seed the previous-contour-center buffer from the given device array."""
    src = dev_initialContourCenter.data
    dst = self.dev_previousContourCenter.data
    cl.enqueue_copy_buffer(self.queue, src, dst).wait()
def start_training( self, context, training_data, training_results,
                    maximal_iterations = 10000, target_error = 0.01, report = False ):
    """
    Starts training.

    @param context
        Input layer of neural network.
    @param training_data
        Array of tuples of inputs and outputs.
    @param training_results
        TrainingResults structure where optimal results will be stored.
    @param maximal_iterations
        Maximal iteration to perform.
    @param target_error
        Target absolute error.
    @param report
        Report object (optimal)

    @return Tuple of performed iterations count, minimal relative error
    """
    start_time = time.clock()

    self.prepare_training( context )

    # Device buffer accumulating the epoch error; seeded with a huge value
    # so the first comparison against minimal_error always passes.
    total_error = numpy.array( [1e12], numpy.float32 )
    total_error_buf = pyopencl.Buffer(
        context.opencl.context,
        pyopencl.mem_flags.READ_WRITE | pyopencl.mem_flags.COPY_HOST_PTR,
        hostbuf = total_error )
    # Zero-filled buffer reused to clear the gradient / error accumulators.
    zeros_buf = pyopencl.Buffer(
        context.opencl.context,
        pyopencl.mem_flags.READ_ONLY | pyopencl.mem_flags.COPY_HOST_PTR,
        hostbuf = numpy.zeros( [context._weights_buf_size], numpy.float32 ) )

    read_ready_event = None

    # Buffer receiving the expected outputs of the current training sample.
    o_buf = pyopencl.Buffer(
        context.opencl.context,
        pyopencl.mem_flags.READ_ONLY | pyopencl.mem_flags.COPY_HOST_PTR,
        hostbuf = numpy.zeros( [context.output_layer.neuron_count], numpy.float32 ) )

    # Static arguments of the error-setup kernel (per-sample args change below).
    context.opencl.kernel_setup_training_data.set_arg( 0, context._neurons_buf_size )
    context.opencl.kernel_setup_training_data.set_arg( 1, context._outputs_buf )
    context.opencl.kernel_setup_training_data.set_arg( 2, context.output_layer._neurons_offset )
    context.opencl.kernel_setup_training_data.set_arg( 3, context.output_layer.neuron_count )
    context.opencl.kernel_setup_training_data.set_arg( 4, o_buf )
    context.opencl.kernel_setup_training_data.set_arg( 5, pyopencl.LocalMemory( 32 * 4 ) )
    context.opencl.kernel_setup_training_data.set_arg( 6, context._errors_backpropagation_buf )
    context.opencl.kernel_setup_training_data.set_arg( 7, total_error_buf )

    # clear gradient
    pyopencl.enqueue_copy_buffer( context.opencl.queue, zeros_buf, context._gradient_buf ).wait()

    i = 0
    calc_error_evt = None
    while training_results.minimal_error > target_error:
        if i >= maximal_iterations:
            break
        i += 1

        # Zero the per-epoch error accumulator (first float only).
        reset_total_error_evt = pyopencl.enqueue_copy_buffer(
            context.opencl.queue, zeros_buf, total_error_buf, byte_count = 4 )

        j = 0
        for inputs, outputs in training_data:
            j += 1
            # pyopencl.enqueue_barrier( context.opencl.queue )
            evt = context.input_layer.set_inputs( inputs, is_blocking = False )
            context.input_layer._process_wait_for.append( evt )
            context.input_layer.process()

            # Upload expected outputs, then run the error-setup kernel once
            # the upload, forward pass, and error reset have all completed.
            evt = pyopencl.enqueue_write_buffer( context.opencl.queue, o_buf, outputs, is_blocking = False )
            calc_error_evt = pyopencl.enqueue_nd_range_kernel(
                context.opencl.queue,
                context.opencl.kernel_setup_training_data,
                ( 32, ), ( 32, ),
                wait_for = ( evt, context.output_layer._process_event, reset_total_error_evt ) )
            # print context.output_layer.get_outputs()
            context.output_layer._calc_gradient_wait_for.append( calc_error_evt )
            context.input_layer.calc_weights_gradient()
            #print context.output_layer._get_gradient( )

            if not self.offline:
                # Online mode: apply the gradient immediately, then clear it
                # for the next sample.
                self.adjust_weights( context )
                evt = pyopencl.enqueue_copy_buffer(
                    context.opencl.queue, zeros_buf, context._gradient_buf,
                    wait_for = ( context.input_layer._calc_gradient_event, ) )
                context.output_layer._calc_gradient_wait_for.append( evt )

            if j % 20000 == 0:
                # Periodically drain the queue to bound the amount of
                # enqueued-but-unfinished work.
                context.opencl.queue.finish()

        if self.offline:
            # Batch mode: apply the gradient once per epoch with the learning
            # rate scaled down by the number of samples, then clear it.
            save_n = self.n
            self.n /= numpy.float32( len( training_data ) )
            self.adjust_weights( context )
            self.n = save_n
            evt = pyopencl.enqueue_copy_buffer( context.opencl.queue, zeros_buf, context._gradient_buf )
            context.output_layer._calc_gradient_wait_for.append( evt )

        if read_ready_event and read_ready_event.command_execution_status == pyopencl.command_execution_status.COMPLETE:
            # A previously issued nonblocking error read has finished;
            # process the (possibly slightly stale) epoch error.
            read_ready_event = None
            error_sum = total_error[0] / len( training_data )
            # print error_sum, ' ', i, ' ', self.n
            if report:
                report.process_iteration( len( training_data ), self, training_results, error_sum, context )
            self.adjust_training_parameters( error_sum )
            if error_sum < training_results.minimal_error:
                training_results.minimal_error = error_sum
                training_results.store_weights( context )  # note: this call is blocking!
            if error_sum < target_error:
                break;
        training_results.opencl_time += context.opencl.gather_opencl_stats()

        if not read_ready_event:
            # we use nonblocking read to avoid waiting for GPU
            # this could lead to a delay in obtaining current error
            # error of current iteration can be returned in several iteration ahead
            read_ready_event = pyopencl.enqueue_read_buffer(
                context.opencl.queue, total_error_buf, total_error,
                is_blocking = False,
                wait_for = ( calc_error_evt, ) if calc_error_evt else None )

    training_results.iterations += i

    # Final, blocking read of the epoch error plus bookkeeping.
    pyopencl.enqueue_read_buffer(
        context.opencl.queue, total_error_buf, total_error,
        is_blocking = True,
        wait_for = ( calc_error_evt, ) if calc_error_evt else None )
    error_sum = total_error[0] / len( training_data )
    if error_sum < training_results.minimal_error:
        training_results.minimal_error = error_sum
        training_results.store_weights( context )
    training_results.opencl_time += context.opencl.gather_opencl_stats()
    training_results.total_time += time.clock() - start_time
def test_overwrite_ecb():
    # Benchmark target: overwrite cl_empty_buffer with cl_zero_buffer via a
    # device-to-device copy (relies on module-level queue/buffer globals).
    cl.enqueue_copy_buffer(queue, cl_zero_buffer, cl_empty_buffer,
                           zero_buffer.nbytes).wait()
# Host-side reference arrays: 200048 uint2 elements each.
zero_buffer = np.zeros(200048, dtype=cl.array.vec.uint2)
empty_buffer = np.empty(200048, dtype=cl.array.vec.uint2)
empty_buffer.fill(16)

mf = cl.mem_flags
cl_zero_buffer = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                           hostbuf=zero_buffer)
cl_empty_buffer = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                            hostbuf=empty_buffer)
# Fill pattern for the enqueue_fill_buffer benchmark (two zero uint32s).
zero = np.zeros(2, np.uint32)

# make sure both buffers are initialised
cl.enqueue_copy(queue, empty_buffer, cl_empty_buffer)
print(empty_buffer)
# Sanity check: the copy should zero out cl_empty_buffer.
cl.enqueue_copy_buffer(queue, cl_zero_buffer, cl_empty_buffer,
                       zero_buffer.nbytes).wait()
cl.enqueue_copy(queue, empty_buffer, cl_empty_buffer)
print(empty_buffer)

# Benchmark: overwrite via device-to-device copy ...
@timeit_repeat(reps)
def test_overwrite_ecb():
    cl.enqueue_copy_buffer(queue, cl_zero_buffer, cl_empty_buffer,
                           zero_buffer.nbytes).wait()

# ... versus overwrite via fill.
@timeit_repeat(reps)
def test_overwrite_efb():
    cl.enqueue_fill_buffer(queue, cl_empty_buffer, zero, 0,
                           zero_buffer.nbytes).wait()