Пример #1
0
	def compute(self):
		"""Ray-trace the molecule into self.dst and upload it as a GL texture.

		Captures the current scene transform from the GL modelview stack,
		uploads it (and its inverse) to the device, runs the pdbTracer
		kernel, reads the image back, and updates the destination texture.
		"""
		# Build the scene matrix on the GL modelview stack without disturbing
		# whatever matrix was current before this call.
		glMatrixMode(GL_MODELVIEW)
		glPushMatrix()
		glLoadIdentity()
		self.applySceneTransforms()
		# transpose() flips between GL's column-major layout and the layout
		# the kernel expects (presumably row-major -- confirm against the .cl).
		mat = np.array(glGetFloat(GL_MODELVIEW_MATRIX).transpose(), order='C')
		glPopMatrix()
		inv = np.array(np.linalg.inv(mat), order='C')
		
		e1 = cl.enqueue_write_buffer(queue, self.matrix, mat)
		e2 = cl.enqueue_write_buffer(queue, self.inv_matrix, inv)
		
		# One work-item per output pixel (first two dims of dst).
		e3 = self.program.pdbTracer(queue, self.dst.shape[:2], self.dst_buf,
			self.matrix, self.inv_matrix, 
			np.array(len(self.mol.spheres)), self.spheredata,
			self.envmap, self.phimap, self.sampler)
		e4 = cl.enqueue_read_buffer(queue, self.dst_buf, self.dst)
		queue.finish()
		e4.wait()

		# Report kernel time in seconds (profiling counters are nanoseconds).
		for e in [e3]:
			print (e.profile.END - e.profile.START)*1e-9
		
		# NOTE(review): N is a module-level constant; assumes dst is an N x N
		# RGBA byte image -- confirm where dst/dstTex are created.
		glBindTexture(GL_TEXTURE_2D, self.dstTex)
		glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, N, N, GL_RGBA, GL_UNSIGNED_BYTE, self.dst)
Пример #2
0
    def update(self, sub_pos, angle, min_dist, max_dist, width, in_weight, out_weight):
        '''
        Perform one update on the probabilities by using the evidence that
        the sub is at position sub_pos, the target is seen at an absolute heading
        of `angle` and is most likely between min_dist and max_dist away.
        in_weight gives the chance that for every point in the region,
        if the buoy is there then we would get this result
        i.e. in_weight = P(this measurement | buoy at point p) for p in our region
        out_weight is the same but for points outside the region
        '''

        n,e = sub_pos
        # Distances are passed squared (so the kernel can avoid per-point
        # sqrt); the heading is converted to radians host-side.
        cl_program.evidence(cl_queue, self.norths.shape, None,
                            self.norths_buf, self.easts_buf, self.prob_buf,
                            float32(n), float32(e),
                            float32(radians(angle)),
                            float32(min_dist**2),
                            float32(max_dist**2),
                            float32(width),
                            float32(in_weight),
                            float32(out_weight))
        #TODO ?
        # Blocking read so the host-side normalization below sees the
        # kernel's updated probabilities.
        cl.enqueue_read_buffer(cl_queue, self.prob_buf, self.probabilities).wait()

        #Normalize
        total_prob = numpy.sum( self.probabilities )
        self.probabilities  /= total_prob

        # Push the normalized distribution back to the device. No wait here;
        # presumably the queue is in-order so the next launch sees it -- confirm.
        cl.enqueue_write_buffer(cl_queue, self.prob_buf, self.probabilities)
Пример #3
0
 def mineThread(self):
     """Worker loop: run the search kernel over queued work and report nonces.

     Iterates the work queue (self.qr), launching one kernel pass per
     iteration of each work item and scanning the output buffer for hits.
     Runs on a worker thread; results are handed to the reactor thread.
     """
     for data in self.qr:
         for i in range(data.iterations):
             # Kernel arguments: hash state words, secondary state words,
             # the per-pass nonce base, precomputed values (data.f), and the
             # device output buffer.
             self.kernel.search(
                 self.commandQueue, (data.size, ), (self.WORKSIZE, ),
                 data.state[0], data.state[1], data.state[2], data.state[3],
                 data.state[4], data.state[5], data.state[6], data.state[7],
                 data.state2[1], data.state2[2], data.state2[3],
                 data.state2[5], data.state2[6], data.state2[7],
                 data.base[i],
                 data.f[0],
                 data.f[1],data.f[2],
                 data.f[3],data.f[4],
                 self.output_buf)
             cl.enqueue_read_buffer(
                 self.commandQueue, self.output_buf, self.output)
             # finish() blocks until the read completes, so self.output is
             # valid below.
             self.commandQueue.finish()
             
             # The OpenCL code will flag the last item in the output buffer when
             # it finds a valid nonce. If that's the case, send it to the main
             # thread for postprocessing and clean the buffer for the next pass.
             if self.output[self.OUTPUT_SIZE]:
                 reactor.callFromThread(self.postprocess, self.output.copy(),
                 data.nr)
         
                 self.output.fill(0)
                 cl.enqueue_write_buffer(
                     self.commandQueue, self.output_buf, self.output)
Пример #4
0
	def allocations(s):
		"""Allocate field storage for every sub-domain.

		The first s.ngpu sub-domains get OpenCL device buffers (six
		read-write E/H fields plus three read-only coefficient fields),
		initialized to 0.0 and 0.5 respectively; the remaining sub-domains
		get plain host-side NumPy copies. Also prepares the byte offset of
		each sub-domain's last yz-plane and two staging planes per
		sub-domain for boundary exchange.
		"""
		s.eh_fieldss = []
		s.ce_fieldss = []
		mf = cl.mem_flags
		for i, nx in enumerate(s.nxs):
			f = np.zeros((nx, s.ny, s.nz), 'f')
			cf = np.ones_like(f) * 0.5

			if i < s.ngpu:
				s.eh_fieldss.append( [cl.Buffer(s.context, mf.READ_WRITE, f.nbytes) for m in range(6)] )
				s.ce_fieldss.append( [cl.Buffer(s.context, mf.READ_ONLY, cf.nbytes) for m in range(3)] )
				for eh_field in s.eh_fieldss[-1]:
					cl.enqueue_write_buffer(s.queues[i], eh_field, f)
				for ce_field in s.ce_fieldss[-1]:
					cl.enqueue_write_buffer(s.queues[i], ce_field, cf)
			else:
				# Bug fix: these comprehensions used 'i' as their loop
				# variable, which in Python 2 leaks into this scope and
				# clobbers the enumerate index; use 'm' as in the branch above.
				s.eh_fieldss.append( [f.copy() for m in xrange(6)] )
				s.ce_fieldss.append( [cf.copy() for m in xrange(3)] )

			del f, cf

		s.offsets = []
		s.tmpfs = []
		for nx in s.nxs:
			# Byte offset of the last yz-plane (float32 = 4 bytes/element).
			s.offsets.append( (nx-1) * s.ny * s.nz * np.nbytes['float32'] )
			s.tmpfs.append( [np.zeros((s.ny, s.nz), dtype=np.float32) for m in range(2)] )
	def __init__(s, fdtd, nx, ny, nz):
		"""Seed every registered field buffer of *fdtd* with random values."""
		super(TestSetFields, s).__init__(nx, ny, nz)
		s.fdtd = fdtd

		queue = s.fdtd.queue
		dtype = s.fdtd.dtype
		for strf in s.strf_list:
			rand_field = np.random.rand(nx, ny, nz).astype(dtype)
			cl.enqueue_write_buffer(queue, s.fdtd.get_buffer(strf), rand_field)
	def exchange_boundary_h(s):
		"""Exchange H-field boundary planes between neighboring GPUs.

		Reads the last hy/hz yz-plane (at the precomputed byte offset) of
		each sub-domain except the last into host staging arrays, then
		writes them into the first plane of the next sub-domain.
		NOTE(review): the reads are not explicitly awaited before the
		dependent writes on the other queues -- presumably synchronized
		elsewhere; confirm.
		"""
		for queue, eh_fields, tmpf, offset in zip(s.queues, s.eh_fields_gpus, s.tmpfs, s.offsets)[:-1]:
			cl.enqueue_read_buffer(queue, eh_fields[4], tmpf[0], offset)	# hy_gpu
			cl.enqueue_read_buffer(queue, eh_fields[5], tmpf[1], offset)	# hz_gpu
		for queue, eh_fields, tmpf in zip(s.queues[1:], s.eh_fields_gpus[1:], s.tmpfs[:-1]):
			cl.enqueue_write_buffer(queue, eh_fields[4], tmpf[0])
			cl.enqueue_write_buffer(queue, eh_fields[5], tmpf[1])
	def exchange_boundary_e(s):
		"""Exchange E-field boundary planes between neighboring GPUs.

		Reads the first ey/ez yz-plane of every sub-domain except the first
		into host staging arrays, then writes each into the last plane
		(byte offset) of the previous sub-domain. NOTE(review): no wait
		between the reads and the dependent writes on other queues --
		confirm synchronization.
		"""
		for queue, eh_fields, tmpf in zip(s.queues, s.eh_fields_gpus, s.tmpfs)[1:]:
			cl.enqueue_read_buffer(queue, eh_fields[1], tmpf[0])	# ey_gpu
			cl.enqueue_read_buffer(queue, eh_fields[2], tmpf[1])	# ez_gpu
		for queue, eh_fields, tmpf, offset in zip(s.queues[:-1], s.eh_fields_gpus[:-1], s.tmpfs[1:], s.offsets[:-1]):
			cl.enqueue_write_buffer(queue, eh_fields[1], tmpf[0], offset)
			cl.enqueue_write_buffer(queue, eh_fields[2], tmpf[1], offset)
Пример #8
0
    def mineThread(self):
        """Worker loop: run the search kernel for every queued work item.

        For each pass, optionally supplies the 32-bit nonce base as an
        OpenCL global work offset (when self.GOFFSET is set) in addition to
        passing it as a kernel argument, then scans the output buffer for
        found nonces.
        """
        for data in self.qr:
            for i in range(data.iterations):
                # Unpack the 32-bit nonce base for use as global_offset.
                offset = (unpack('I', data.base[i])[0],) if self.GOFFSET else None
                self.kernel.search(
                    self.commandQueue, (data.size, ), (self.WORKSIZE, ),
                    data.state[0], data.state[1], data.state[2], data.state[3],
                    data.state[4], data.state[5], data.state[6], data.state[7],
                    data.state2[1], data.state2[2], data.state2[3],
                    data.state2[5], data.state2[6], data.state2[7],
                    data.base[i],
                    data.f[0], data.f[1], data.f[2], data.f[3],
                    data.f[4], data.f[5], data.f[6], data.f[7],
                    self.output_buf, global_offset=offset)
                cl.enqueue_read_buffer(self.commandQueue, self.output_buf,
                                       self.output, is_blocking=False)
                # finish() guarantees the non-blocking read above completed.
                self.commandQueue.finish()

                # The OpenCL code will flag the last item in the output buffer
                # when it finds a valid nonce. If that's the case, send it to
                # the main thread for postprocessing and clean the buffer
                # for the next pass.
                if self.output[self.WORKSIZE]:
                    reactor.callFromThread(self.postprocess,
                    self.output.copy(), data.nr)

                    self.output.fill(0)
                    cl.enqueue_write_buffer(self.commandQueue, self.output_buf,
                                            self.output, is_blocking=False)
	def __init__(s, fdtd, nx, ny, nz):
		"""Create random host-side reference arrays and upload them to *fdtd*."""
		super(TestGetFields, s).__init__(nx, ny, nz)
		s.fdtd = fdtd

		s.fhosts = {}
		queue = s.fdtd.queue
		for strf in s.strf_list:
			host_field = np.random.rand(nx, ny, nz).astype(s.fdtd.dtype)
			s.fhosts[strf] = host_field
			cl.enqueue_write_buffer(queue, s.fdtd.get_buffer(strf), host_field)
Пример #10
0
    def set_target(self, target):
        """Store *target* as a float32 array and upload it to the device.

        Allocates the device buffer on first use; afterwards the existing
        buffer is simply overwritten.
        """
        # Removed an unused local ('flags = mf.READ_ONLY | mf.COPY_HOST_PTR')
        # that was computed but never passed anywhere.
        self.target = np.array(target, np.float32)

        if self.target_buffer is None:
            # First call: allocate a device buffer initialized from the host array.
            self.target_buffer = self.buffer(self.target)
        else:
            # Reuse the existing device buffer; just overwrite its contents.
            cl.enqueue_write_buffer(self.queue, self.target_buffer, self.target)
Пример #11
0
    def randomize_weights( self, context ):
        """
        Fill the layer's weight buffer with small random values.

        Weights are drawn uniformly, shifted to [-0.5, 0.5) and scaled by
        4 / sqrt(weights_per_neuron), then uploaded to the device (blocking).
        """
        buf_size = context._weights_buf_size
        scale = 4.0 / numpy.sqrt( numpy.float32( buf_size / context._neurons_buf_size ) )

        weights = numpy.random.rand( buf_size ).astype( numpy.float32 )
        weights -= 0.5
        weights *= scale

        pyopencl.enqueue_write_buffer( context.opencl.queue, context._weights_buf,
                                       weights, is_blocking = True )
Пример #12
0
 def set_weights( self, weights ):
     """
     Upload *weights* for the entire layer into the shared weights buffer.
     
     @param weights
         NumPy.NDArray of float32 values, size equals to inputs_per_neuron * neuron_count
     """
     # Each weight is a 4-byte float32, so the element offset becomes a byte offset.
     byte_offset = int( self._weights_offset * 4 )
     pyopencl.enqueue_write_buffer( self.opencl.queue,
                                    self.context._weights_buf,
                                    weights,
                                    device_offset = byte_offset,
                                    is_blocking = True )
Пример #13
0
	def __init__(self, baseTab):
		"""Compile the gutarp OpenCL kernel and upload *baseTab* to the device.

		@param baseTab: NumPy array holding the base table the kernel searches.
		"""
		self.baseTab = baseTab

		self.ctx = cl.create_some_context()
		self.queue = cl.CommandQueue(self.ctx)
		# Bug fix: the file handle was never closed; 'with' guarantees it is.
		with open("gutarp.cl", 'r') as f:
			fstr = f.read()
		self.guTarpCL = cl.Program(self.ctx, fstr).build()

		# COPY_HOST_PTR initializes the buffer from baseTab at creation; the
		# explicit write below is kept from the original to force a fresh copy.
		self.baseTabBuffer = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=self.baseTab)
		cl.enqueue_write_buffer(self.queue, self.baseTabBuffer, self.baseTab)
Пример #14
0
    def push_particles(self, pos, vel, color):
        """Append the positions of *pos.shape[0]* new particles to the
        GL-shared position buffer.

        Silently drops the request if it would exceed sph.max_num.
        NOTE(review): vel and color are accepted but never uploaded here.
        """
        nn = pos.shape[0]
        if self.num + nn > self.sph.max_num:
            return

        # position_u is shared with GL: acquire before writing from CL.
        self.acquire_gl()
        # NOTE(review): self.num is passed as the write's device_offset,
        # which pyopencl takes in *bytes* -- looks like it may need scaling
        # by the per-particle byte size; confirm against the buffer layout.
        cl.enqueue_write_buffer(self.queue, self.position_u, pos, self.num)
        self.release_gl()

        self.num += nn
        self.update_sphp()
        self.queue.finish()
Пример #15
0
        def test_process( self ):
            """Push random weights plus squared inputs through the input layer
            and verify each hidden layer received its slice of the outputs."""
            weights = numpy.random.rand( self.nnc._weights_buf_size ).astype( numpy.float32 )
            weights -= 0.5
            # Scale to roughly +-2/sqrt(weights-per-neuron).
            weights *= 4.0 / numpy.sqrt( numpy.float32( self.nnc._weights_buf_size / self.nnc._neurons_buf_size ) )

            pyopencl.enqueue_write_buffer( self.ocl.queue, self.nnc._weights_buf, weights, is_blocking = True )

            self.nnc.input_layer.set_inputs( numpy.array( [x * x for x in range( 0, 10 )], numpy.float32 ), is_blocking = True )
            self.nnc.input_layer.process()

            # Overlapping input slices: h1 sees [:3], h2 [:5], h3 [4:10].
            self.assertArrayEqual( self.i.get_outputs()[:3], self.h1.get_inputs() )
            self.assertArrayEqual( self.i.get_outputs()[:5], self.h2.get_inputs() )
            self.assertArrayEqual( self.i.get_outputs()[4:10], self.h3.get_inputs()[:6] )
def exchange_boundary(snx, ny, queues, f_gpus, tmp_hs, tmp_ts):
	"""Exchange one-row boundary data between adjacent GPU sub-domains.

	Each sub-domain is an (snx, ny) float32 array; offsets are in bytes
	(ny * 4 per row). Row 1 (head) and row snx-2 (tail) of each domain are
	read into host staging arrays, then written into the neighbors' ghost
	rows (row 0 of the next domain, row snx-1 of the previous).

	NOTE(review): the reads are not awaited before the dependent writes on
	other queues -- presumably something synchronizes between the two
	loops; confirm.
	"""
	ngpu = len(queues)

	# Read interior boundary rows into host staging arrays.
	for i, queue in enumerate(queues):
		if i>0: 
			cl.enqueue_read_buffer(queue, f_gpus[i], tmp_hs[i], device_offset=ny*4)
		if i<ngpu-1: 
			cl.enqueue_read_buffer(queue, f_gpus[i], tmp_ts[i], device_offset=(snx-2)*ny*4)

	# Write the staged rows into the neighbors' ghost rows.
	for i, queue in enumerate(queues):
		if i>0: 
			cl.enqueue_write_buffer(queue, f_gpus[i], tmp_ts[i-1])
		if i<ngpu-1: 
			cl.enqueue_write_buffer(queue, f_gpus[i], tmp_hs[i+1], device_offset=(snx-1)*ny*4)
Пример #17
0
def sobel(im, cl=None):
    """Run the Sobel edge filter on *im* via OpenCL.

    Returns (magnitude, x-gradient, y-gradient) float32 arrays shaped like
    *im*. If *cl* (the context dict produced by setup()) is None, a fresh
    one is created for this image.
    """
    if cl is None:
        cl = setup(im)
    im = im.astype(numpy.float32)
    queue = cl['queue']
    pyopencl.enqueue_write_buffer(queue, cl['im_dev'], im)
    cl['prg'].sobel(queue, im.shape, (3,),
                    cl['im_dev'], cl['m_dev'], cl['x_dev'], cl['y_dev'])
    results = []
    for key in ('m_dev', 'x_dev', 'y_dev'):
        out = numpy.empty_like(im)
        pyopencl.enqueue_read_buffer(queue, cl[key], out).wait()
        results.append(out)
    return tuple(results)
Пример #18
0
	def compute(self, idTab) :
		"""Run the gutarp kernel over *idTab* against the base table.

		@param idTab: 2-D NumPy array of ids, one row per work-item.
		@return: (idTab.shape[0], 1) int32 array with one result per row.
		"""
		# COPY_HOST_PTR initializes the device buffer from the host array at
		# creation, so the former explicit enqueue_write_buffer calls were
		# redundant transfers and have been removed.
		idTabBuffer = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=idTab)

		result = numpy.empty([idTab.shape[0], 1], dtype=numpy.int32)
		resultBuffer = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=result)

		# One work-item per input row; base-table dimensions passed as int32.
		self.guTarpCL.gutarp(self.queue, (idTab.shape[0],), None, self.baseTabBuffer, idTabBuffer,
								numpy.int32(self.baseTab.shape[0]), numpy.int32(self.baseTab.shape[1]), resultBuffer)

		# Blocking read: wait until the kernel has produced the results.
		cl.enqueue_read_buffer(self.queue, resultBuffer, result).wait()
		return result
Пример #19
0
def compute_raytrace(mat):
    """Launch the raytrace kernel for the 4x4 float32 camera matrix *mat*.

    Uploads mat and its inverse to the module-level mat_buf/inv_mat_buf,
    then renders into img_buf using the mesh previously installed by
    load_mesh(). Returns the (already completed) kernel event.
    """
    assert mat.dtype == np.float32
    assert mat.shape == (4,4)
    mat = np.ascontiguousarray(mat)

    # The .wait() on the second write also orders the first on this queue
    # (presumably an in-order queue -- confirm).
    cl.enqueue_write_buffer(queue, mat_buf, mat)
    cl.enqueue_write_buffer(queue, inv_mat_buf, 
                            np.ascontiguousarray(np.linalg.inv(mat))).wait()
    
    # One work-item per pixel of the H x W output image.
    evt = program.raytrace(queue, (H,W), None,
                           img_buf,
                           mat_buf, inv_mat_buf,
                           np.int32(num_faces), vert_buf, face_buf)
                           
    evt.wait()
    return evt
Пример #20
0
 def calc_radii_gpu(self):
   """Run the calc_radii kernel over all particles on the device.

   Lazily initializes OpenCL, uploads the particle list (blocking),
   launches the kernel with the global size rounded up to whole 1024-wide
   blocks, then reads the updated particle list back (blocking).
   """
   if self.openCLInitialized == 0:
     self.initialize_opencl()
   cl.enqueue_write_buffer(self.queue, self.ptcllist_d,
       self.ptcllist).wait()
   blockSize = 1024
   # Round up so every particle is covered; presumably excess work-items
   # bail out inside the kernel using the nptcls argument -- confirm.
   numBlocks = int(
       math.ceil(numpy.float64(self.nptcls) /
         numpy.float64(blockSize)))
   print(blockSize)
   print(numBlocks)
   self.prg.calc_radii_kernel(
       self.queue,
       (numBlocks * blockSize,), 
       (blockSize,),
       self.ptcllist_d,
       numpy.int32(self.nptcls),
       numpy.float64(self.k))
   cl.enqueue_read_buffer(self.queue, self.ptcllist_d, self.ptcllist).wait()
Пример #21
0
def load_mesh(vertices, faces):
    """Upload a mesh to the device for later use by compute_raytrace().

    Sets the module-level vert_buf, face_buf and num_faces. Both writes
    are blocking, so the buffers are fully populated on return.

    @param vertices: (n, 4) float32 C-contiguous array.
    @param faces:    (m, 4) int32 C-contiguous array of vertex indices.
    @return: the event of the last (blocking) buffer write.
    """
    assert vertices.dtype == np.float32
    assert vertices.shape[1] == 4
    assert vertices.flags['C_CONTIGUOUS']
    assert faces.dtype == np.int32
    assert faces.shape[1] == 4
    assert faces.flags['C_CONTIGUOUS']

    global vert_buf, face_buf, num_faces
    num_faces = faces.shape[0]
    # .nbytes equals rows * 4 columns * 4 bytes given the asserts above;
    # clearer than the former hand-computed shape[0]*4*4.
    vert_buf = cl.Buffer(context, mf.READ_WRITE, vertices.nbytes)
    face_buf = cl.Buffer(context, mf.READ_WRITE, faces.nbytes)

    # Both writes block; only the last event is returned (the first
    # assignment to 'evt' was dead and has been dropped).
    cl.enqueue_write_buffer(queue, vert_buf, vertices,
                            is_blocking=True)
    evt = cl.enqueue_write_buffer(queue, face_buf, faces,
                                  is_blocking=True)
    return evt
Пример #22
0
	def __init__(s, context, queue, nx, ny, nz, dtype=np.float32):
		"""Allocate and initialize the FDTD field buffers on one device.

		Creates six read-write E/H field buffers (ex..hz) and three
		read-only coefficient buffers, all shaped (nx, ny, nz); fields
		start at 0.0 and coefficients at 0.5.
		"""
		s.context = context
		s.queue = queue
		s.nx = nx
		s.ny = ny
		s.nz = nz
		s.dtype = dtype

		mf = cl.mem_flags
		f = np.zeros((s.nx, s.ny, s.nz), dtype=s.dtype)
		cf = np.ones_like(f) * 0.5

		# Named aliases (s.ex .. s.hz) and the grouped list refer to the
		# same six buffer objects.
		s.ehs = s.ex, s.ey, s.ez, s.hx, s.hy, s.hz = [cl.Buffer(s.context, mf.READ_WRITE, f.nbytes) for i in range(6)]
		s.ces = [cl.Buffer(s.context, mf.READ_ONLY, cf.nbytes) for i in range(3)]

		for eh in s.ehs: cl.enqueue_write_buffer(queue, eh, f) 
		for ce in s.ces: cl.enqueue_write_buffer(queue, ce, cf) 

		# NOTE(review): the writes above are not awaited; dropping the host
		# arrays immediately assumes the copies happen (or are referenced)
		# eagerly -- confirm.
		del f, cf
Пример #23
0
def add(a, b):
    a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
    b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
    dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)

    test1 = struct.pack('ffffi', .5, 10., .1, .2, 3)
    print test1, len(test1), struct.calcsize('ffff')

    test_buf = cl.Buffer(ctx, mf.READ_ONLY, len(test1))
    cl.enqueue_write_buffer(queue, test_buf, test1).wait()
    
    global_size = a.shape
    local_size = None
    prg.sum(queue, global_size, local_size, a_buf, b_buf, dest_buf, test_buf)
    queue.finish()

    c = np.empty_like(a)
    cl.enqueue_read_buffer(queue, dest_buf, c).wait()
    return c
Пример #24
0
    def get_weights_direction_buf( self, context ):
        """
        Returns direction by which adjust weights.

        Computes the conjugate-gradient beta from the current and previous
        gradient buffers on the device, periodically restarts the method
        (beta forced to zero once more than context.total_neurons
        iterations have accumulated), then forms the new search direction
        on the device and returns its buffer.
        """
        # Device reduction over 64 work-items with two local scratch areas;
        # the resulting scalar beta lands in self.beta_buf.
        context.opencl.kernel_calc_conjugate_gradient_beta( 
            context.opencl.queue, ( 64, ),
            context._gradient_buf,
            self.prev_gradient_buf,
            numpy.int32( context._weights_buf_size ),
            pyopencl.LocalMemory( 256 ),
            pyopencl.LocalMemory( 256 ),
            self.beta_buf,
            local_size = ( 64, )
            )

#        test1 = numpy.ndarray( [ context.weights_buf_size ], numpy.float32 )
#        pyopencl.enqueue_read_buffer( context.opencl.queue, context.gradient_buf, test1, is_blocking = True )
#        test2 = numpy.ndarray( [ context.weights_buf_size ], numpy.float32 )
#        pyopencl.enqueue_read_buffer( context.opencl.queue, self.prev_gradient_buf, test2, is_blocking = True )
#
#        beta = numpy.float32( ( test1 * ( test1 - test2 ) ).sum() / ( test2 * test2 ).sum() )
#        pyopencl.enqueue_write_buffer( context.opencl.queue, self.beta_buf, numpy.array( [beta], numpy.float32 ), is_blocking = True )
#
#        test = numpy.ndarray( [ context.weights_buf_size ], numpy.float32 )
#        pyopencl.enqueue_read_buffer( context.opencl.queue, self.beta_buf, test, is_blocking = True )

        self.iteration_count += 1
        if self.iteration_count > context.total_neurons:
            # Conjugate-gradient restart: zero beta so the next direction is
            # plain steepest descent.
            pyopencl.enqueue_write_buffer( context.opencl.queue, self.beta_buf, numpy.zeros( [1], numpy.float32 ), is_blocking = True )
            self.iteration_count = 0

        # Forms the new direction from the gradient, beta and the previous
        # direction, and stores the gradient for the next call.
        # NOTE(review): exact semantics live in the .cl kernel -- confirm.
        context.opencl.kernel_calc_conjugate_gradient_direction( 
            context.opencl.queue, ( int( context._weights_buf_size ), ),
            context._gradient_buf,
            self.beta_buf,
            self.direction_buf,
            self.prev_gradient_buf
            )

#        pyopencl.enqueue_read_buffer( context.opencl.queue, self.direction_buf, test, is_blocking = True )

        return self.direction_buf
Пример #25
0
def add(a, b):
    a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
    b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
    dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)

    params = struct.pack('ffffi', .5, 10., 0., 0., 3)
    print len(params), struct.calcsize('ffffi')

    params_buf = cl.Buffer(ctx, mf.READ_ONLY, len(params))
    cl.enqueue_write_buffer(queue, params_buf, params).wait()

    global_size = a.shape
    local_size = None
    prg.part3(queue, global_size, local_size, a_buf, b_buf, dest_buf,
              params_buf)
    queue.finish()

    c = np.empty_like(a)
    cl.enqueue_read_buffer(queue, dest_buf, c).wait()
    return c
Пример #26
0
    def start(self):
        """Kick off one iteration: fresh random x, async upload, mat-vec kernel.

        Records host RNG time in self.rand_t and keeps the upload/kernel
        events (self.x_wr_evt, self.mv_evt) for the caller to wait on.
        """
        mat_shape = self.a_dev.shape

        rand_start_t = time()
        self.x_host = np.random.uniform(size=mat_shape[1]).astype(np.float32)
        self.rand_t = time()-rand_start_t

        # Non-blocking upload; presumably the in-order queue sequences it
        # before the kernel below -- confirm queue properties.
        self.x_wr_evt = cl.enqueue_write_buffer(
                self.queue, self.x_dev.data, self.x_host, is_blocking=False)
        # Work-group size 128 over mat_shape[0] rows; NOTE(review): the
        # name y_host_buf suggests a host-mapped result buffer -- confirm.
        self.mv_evt = self.mat_vec_knl(self.queue, (mat_shape[0],), (128,),
                self.a_dev.data, self.x_dev.data, self.b_dev.data, self.y_host_buf,
                mat_shape[1])
Пример #27
0
 def set_inputs( self, inputs, is_blocking = True, wait_for = None ):
     """
     Upload *inputs* into this layer's slice of the shared inputs buffer.
     
     @param inputs
         NumPy.NDArray of float32 values, size equals to neuron count
     @return the pyopencl event for the enqueued write.
     """
     # float32 elements -> byte offset is the element offset times 4.
     byte_offset = int( self._inputs_offset * 4 )
     return pyopencl.enqueue_write_buffer( self.opencl.queue,
                                           self.context._inputs_buf,
                                           inputs,
                                           device_offset = byte_offset,
                                           is_blocking = is_blocking,
                                           wait_for = wait_for )
Пример #28
0
    def compute(self, recordNum, pattern, bucketIdx, actValue, learn, infer):
        """
        Computes 1 step
        :param recordNum:
        :param pattern: indices of active columns in the TM layer
        :param classification: dict of bucketIdx and actualValue
        :param learn:
        :param infer:
        :return:
        """
        pattern = np.array(pattern, dtype=cltypes.uint)
        if not self._init_buffers:
            self._setup_buffers(pattern)

        # Upload the active-bit indices; later kernels depend on this event.
        ev_copy_pattern = cl.enqueue_write_buffer(self._queue, self.cl_activeBitIdx, pattern)
        # update bit activations on device side
        ev_update_bit = self._prg.update_bit_activations(self._queue, (pattern.size,), None,
                                                         self.cl_bit_activations, self.cl_activeBitIdx,
                                                         wait_for=[ev_copy_pattern])

        multiStepPredictions = {}
        ev_learn = None
        if learn:
            # One work-item per (step, active bit) pair updates the running
            # averages/counts for the observed bucket.
            ev_learn = [self._prg.learn(self._queue, (self.step_count * pattern.size,), None,
                                        self.cl_activeBitIdx, self.cl_table_average, self.cl_table_counts, self.alpha,
                                        self.actValueAlpha,
                                        cltypes.uint(bucketIdx), self._numBuckets,
                                        wait_for=[ev_update_bit])]
        if infer:
            """
                const __global float* averages,
                const __global uint* counts,
                const __global uint* activeBitIdx,
                __global float2* predictions, // the array of predictions
                __global const  uint* bitActivations, // the number of times each bit has been active
                uint const activeBits
            """
            # kernel for every active bit in each step
            # NOTE(review): wait_for=ev_learn is None when learn is False;
            # pyopencl treats None as "no dependencies".
            ev_infer = self._prg.infer(self._queue, (self._numBuckets,), None,
                                       self.cl_table_average, self.cl_table_counts, self.cl_activeBitIdx,
                                       self.cl_predictions, self.cl_bit_activations,
                                       cltypes.uint(pattern.size), wait_for=ev_learn)

            cl.enqueue_copy(self._queue, self._predictions, self.cl_predictions, wait_for=[ev_infer]).wait()
        # print("Activations", self.bucket_activations)
        # multiStepPredictions['actualValues'] = predictions['x'] / len(pattern)
        # multiStepPredictions[step] = predictions['y'] / len(pattern)  # the probability for each bucket
        # print("Actual Values", multiStepPredictions['actualValues'])
        # Only single-step predictions (key 1) are returned. NOTE(review):
        # when infer is False this copies whatever _predictions last held.
        multiStepPredictions[1] = self._predictions.copy()
        # print("Probability", multiStepPredictions[1])
        self.bucket_activations[bucketIdx] += 1

        return multiStepPredictions
Пример #29
0
    def FuseRGBD_GPU(self, Image, boneDQ, jointDQ):
        """
        Update the TSDF volume with Image
        :param Image: RGBD image to update to its surfaces
        :param boneDQ: the dual quaternion of bone in new frame
        :param jointDQ: the dual quaternion of joint in new frame
        :param bp: the indexof body part
        :return: none
        """
        # initialize buffers
        #cl.enqueue_write_buffer(self.GPUManager.queue, self.Pose_GPU, Tg)
        # Upload the new frame's depth image and the skinning dual quaternions.
        cl.enqueue_write_buffer(self.GPUManager.queue, self.DepthGPU,
                                Image.depth_image)
        cl.enqueue_write_buffer(self.GPUManager.queue, self.boneDQGPU, boneDQ)
        cl.enqueue_write_buffer(self.GPUManager.queue, self.jointDQGPU,
                                jointDQ)

        # fuse data of the RGBD image with the TSDF volume 3D model
        # (2-D launch over the first two volume dimensions; presumably one
        # work-item per voxel column -- confirm against the kernel).
        self.GPUManager.programs['FuseTSDF'].FuseTSDF(self.GPUManager.queue, (self.Size[0], self.Size[1]), None, \
                                self.TSDFGPU, self.DepthGPU, self.Param, self.Size_Volume, self.Pose_GPU, \
                                self.boneDQGPU, self.jointDQGPU, self.planeF,\
                                self.Calib_GPU, np.int32(Image.Size[0]), np.int32(Image.Size[1]),self.WeightGPU)

        # update CPU array. Read the buffer to write in the CPU array.
        cl.enqueue_read_buffer(self.GPUManager.queue, self.TSDFGPU,
                               self.TSDF).wait()
        '''
        # TEST if TSDF contains NaN
        TSDFNaN = np.count_nonzero(np.isnan(self.TSDF))
        print "TSDFNaN : %d" %(TSDFNaN)
        '''
        cl.enqueue_read_buffer(self.GPUManager.queue, self.WeightGPU,
                               self.Weight).wait()
Пример #30
0
	def setBuffers(self, A,B,x0):
		""" Create/set OpenCL required buffers.
		@param A Linear system matrix.
		@param B Independent linear term.
		@param x0 Initial solution estimator.
		"""
		# Get dimensions
		shape = np.shape(A)
		if len(shape) != 2:
			raise ValueError, 'Matrix A must be 2 dimensional array'
		if shape[0] != shape[1]:
			raise ValueError, 'Square linear system matrix expected'
		if len(B) != shape[0]:
			raise ValueError, 'Matrix and independet term dimensions does not match'
		n = len(B)
		# Set x0 if not provided
		if x0 != None:
			if len(x0) != n:
				raise ValueError, 'Initial solution estimator length does not match with linear system dimensions'
		if x0 == None:
			x0 = B
		# Create OpenCL objects if not already generated
		if not self.A:
			mf = cl.mem_flags
			self.A      = cl.Buffer( self.context, mf.READ_ONLY,  size = n*n * np.dtype('float32').itemsize )
			self.B      = cl_array.zeros(self.context,self.queue, (n), np.float32)
			self.X0     = cl_array.zeros(self.context,self.queue, (n), np.float32)
			self.X      = cl_array.zeros(self.context,self.queue, (n), np.float32)
			self.R      = cl_array.zeros(self.context,self.queue, (n), np.float32)
			self.x      = np.zeros((n), dtype=np.float32)
			self.n      = n
		# Transfer data to buffers
		events = []
		events.append(cl.enqueue_write_buffer(self.queue, self.A,  A.reshape((n*n)) ))
		events.append(cl.enqueue_write_buffer(self.queue, self.B.data,  B))
		events.append(cl.enqueue_write_buffer(self.queue, self.X0.data, x0))
		for e in events:
			e.wait()
Пример #31
0
    def compute(self, batch):
        "compute best path for each batch element. Returns blank-terminated label strings for batch elements."

        # measure time in GPU debug mode
        if self.enableGPUDebug:
            t0 = time.time()

        # copy batch to device
        cl.enqueue_write_buffer(self.queue,
                                self.batchBuf,
                                batch.astype(np.float32),
                                is_blocking=False)

        # one pass
        if self.kernelVariant == 1:
            # Single kernel: one work-group per batch element, maxT items each.
            cl.enqueue_nd_range_kernel(self.queue, self.kernel1,
                                       (self.batchSize, self.maxT),
                                       (1, self.maxT))
        # two passes
        else:
            # Pass 1 over (batch, time, classes); pass 2 reduces per element.
            cl.enqueue_nd_range_kernel(self.queue, self.kernel1,
                                       (self.batchSize, self.maxT, self.maxC),
                                       (1, 1, self.maxC))
            cl.enqueue_nd_range_kernel(self.queue, self.kernel2,
                                       (self.batchSize, ), None)

        # copy result back from GPU and return it
        # (blocking read; presumably the in-order queue has already run the
        # kernels above by the time it completes -- confirm).
        cl.enqueue_read_buffer(self.queue,
                               self.resBuf,
                               self.res,
                               is_blocking=True)

        # measure time in GPU debug mode
        if self.enableGPUDebug:
            t1 = time.time()
            print('BestPathCL.compute(...) time: ', t1 - t0)

        return self.res
Пример #32
0
 def set_inputs(self, inputs, is_blocking=True, wait_for=None):
     """
     Copy *inputs* into this layer's region of the shared input buffer.
     
     @param inputs
         NumPy.NDArray of float32 values, size equals to neuron count
     @return pyopencl event for the enqueued write.
     """
     queue = self.opencl.queue
     dest = self.context._inputs_buf
     # _inputs_offset counts float32 elements; device_offset wants bytes.
     return pyopencl.enqueue_write_buffer(queue, dest, inputs,
                                          device_offset=int(self._inputs_offset * 4),
                                          is_blocking=is_blocking,
                                          wait_for=wait_for)
Пример #33
0
    def start(self):
        """Launch one benchmark iteration: random x vector, async upload,
        matrix-vector kernel.

        Stores the host RNG time in self.rand_t and keeps the upload and
        kernel events (self.x_wr_evt, self.mv_evt) for later waiting.
        """
        mat_shape = self.a_dev.shape

        t0 = time()
        self.x_host = np.random.uniform(size=mat_shape[1]).astype(np.float32)
        self.rand_t = time() - t0

        # Non-blocking upload; callers synchronize via x_wr_evt.
        self.x_wr_evt = cl.enqueue_write_buffer(self.queue,
                                                self.x_dev.data,
                                                self.x_host,
                                                is_blocking=False)
        # Work-group size 128 over the matrix rows.
        self.mv_evt = self.mat_vec_knl(self.queue, (mat_shape[0], ), (128, ),
                                       self.a_dev.data, self.x_dev.data,
                                       self.b_dev.data, self.y_host_buf,
                                       mat_shape[1])
Пример #34
0
 def work():
     """Generator-based worker: pull work, run the search kernel, report hits.

     Runs until flag[0] goes false; yields deferreds (sleep when starved,
     deferToThread for finish) so the enclosing Twisted machinery can wait
     without blocking the reactor.
     """
     while flag[0]:
         
         work = work_getter()
         if work is None:
             print "Worker starved!"
             yield sleep(1)
             continue
         
         data, i = work
         
         self.kernel.search(
             self.commandQueue, (data.size, ), (self.WORKSIZE, ),
             data.state[0], data.state[1], data.state[2], data.state[3],
             data.state[4], data.state[5], data.state[6], data.state[7],
             data.state2[1], data.state2[2], data.state2[3],
             data.state2[5], data.state2[6], data.state2[7],
             data.base[i],
             data.f[0], data.f[1], data.f[2], data.f[3],
             data.f[4], data.f[5], data.f[6], data.f[7],
             self.output_buf)
         cl.enqueue_read_buffer(
             self.commandQueue, self.output_buf, self.output)
         #self.commandQueue.finish()
         # Run the blocking finish() off the reactor thread.
         yield threads.deferToThread(self.commandQueue.finish)
         
         # The OpenCL code will flag the last item in the output buffer
         # when it finds a valid nonce. If that's the case, send it to
         # the main thread for postprocessing and clean the buffer
         # for the next pass.
         if self.output[self.OUTPUT_SIZE]:
             self.postprocess(self.output.copy(), data.nr, solution_putter)
             
             self.output.fill(0)
             cl.enqueue_write_buffer(
                 self.commandQueue, self.output_buf, self.output)
Пример #35
0
        def test_process(self):
            """Push random weights plus squared inputs through the input layer
            and verify each hidden layer received its slice of the outputs."""
            weights = numpy.random.rand(self.nnc._weights_buf_size).astype(
                numpy.float32)
            weights -= 0.5
            # Scale to roughly +-2/sqrt(weights-per-neuron).
            weights *= 4.0 / numpy.sqrt(
                numpy.float32(
                    self.nnc._weights_buf_size / self.nnc._neurons_buf_size))

            pyopencl.enqueue_write_buffer(self.ocl.queue,
                                          self.nnc._weights_buf,
                                          weights,
                                          is_blocking=True)

            self.nnc.input_layer.set_inputs(numpy.array(
                [x * x for x in range(0, 10)], numpy.float32),
                                            is_blocking=True)
            self.nnc.input_layer.process()

            # Overlapping input slices: h1 sees [:3], h2 [:5], h3 [4:10].
            self.assertArrayEqual(self.i.get_outputs()[:3],
                                  self.h1.get_inputs())
            self.assertArrayEqual(self.i.get_outputs()[:5],
                                  self.h2.get_inputs())
            self.assertArrayEqual(self.i.get_outputs()[4:10],
                                  self.h3.get_inputs()[:6])
Пример #36
0
    def search(self, midstate):
        """Run one fill + search pass for *midstate*.

        sha512_fill expands the endian-flipped midstate into HASHES_NUM
        hashes on the device; the output buffer is then zeroed and ksearch
        scans KEYS_NUM keys against keyhash_buf. Returns the raw output
        bytes (as a str).
        """
        msg = flipendian32(midstate)

        # Pass the midstate as eight 4-byte chunks (kernel args 0..7).
        for i in xrange(8):
            self.sha512_fill.set_arg(i, msg[i * 4:i * 4 + 4])
        self.sha512_fill.set_arg(8, self.hashes_buf)
        self.sha512_fill.set_arg(9, self.keyhash_buf)
        # t1 = time.time()
        cl.enqueue_nd_range_kernel(self.queue, self.sha512_fill, (HASHES_NUM,), (self.sha512_fill_ws,))
        self.queue.finish()
        # print "fill %f" % (time.time() - t1)

        # Zero the device-side output buffer before the search pass.
        output = bytearray(OUTPUT_SIZE)
        cl.enqueue_write_buffer(self.queue, self.output_buf, output)
        self.queue.finish()

        self.ksearch.set_arg(0, self.hashes_buf)
        self.ksearch.set_arg(1, self.keyhash_buf)
        self.ksearch.set_arg(2, self.output_buf)
        cl.enqueue_nd_range_kernel(self.queue, self.ksearch, (KEYS_NUM,), (self.ksearch_ws,))
        self.queue.finish()
        # Read the results back into the same host bytearray.
        cl.enqueue_read_buffer(self.queue, self.output_buf, output)
        self.queue.finish()
        return str(output)
Пример #37
0
        def work():
            """Worker generator: fetch work units, run the search kernel,
            and hand any solved nonce to the main thread.

            Twisted generator style: it yields deferreds (sleep /
            deferToThread) so the reactor stays responsive while the device
            computes.  Loops until flag[0] goes false.
            """
            while flag[0]:

                # Pull the next work unit; back off for a second when the
                # getter has nothing for us.
                work = work_getter()
                if work is None:
                    print "Worker starved!"
                    yield sleep(1)
                    continue

                data, i = work

                # One kernel launch over data.size nonces, WORKSIZE threads
                # per group; the two midstates and padding words are passed
                # as scalar arguments.
                self.kernel.search(
                    self.commandQueue, (data.size, ), (self.WORKSIZE, ),
                    data.state[0], data.state[1], data.state[2], data.state[3],
                    data.state[4], data.state[5], data.state[6], data.state[7],
                    data.state2[1], data.state2[2], data.state2[3],
                    data.state2[5], data.state2[6], data.state2[7],
                    data.base[i], data.f[0], data.f[1], data.f[2], data.f[3],
                    data.f[4], data.f[5], data.f[6], data.f[7],
                    self.output_buf)
                cl.enqueue_read_buffer(self.commandQueue, self.output_buf,
                                       self.output)
                #self.commandQueue.finish()
                # Block in a thread instead of stalling the reactor.
                yield threads.deferToThread(self.commandQueue.finish)

                # The OpenCL code will flag the last item in the output buffer
                # when it finds a valid nonce. If that's the case, send it to
                # the main thread for postprocessing and clean the buffer
                # for the next pass.
                if self.output[self.OUTPUT_SIZE]:
                    self.postprocess(self.output.copy(), data.nr,
                                     solution_putter)

                    self.output.fill(0)
                    cl.enqueue_write_buffer(self.commandQueue, self.output_buf,
                                            self.output)
Пример #38
0
 def to_buf(self, cl_buf, source=None):
     """Blocking host-to-device copy into ``cl_buf``.

     Without ``source``, the host array registered for ``cl_buf`` in
     ``self.buffers`` is written; otherwise ``source`` itself (or its
     underlying base array, when it is a view) is written.
     """
     if source is not None:
         host_array = source.base if source.base is not None else source
     elif cl_buf in self.buffers:
         host_array = self.buffers[cl_buf]
     else:
         raise ValueError("Unknown compute buffer and source not specified.")
     cl.enqueue_write_buffer(self.default_queue, cl_buf, host_array).wait()
Пример #39
0
	def setBuffers(self, fs, waves, sea, bem, body):
		""" Create/set OpenCL required buffers.
		@param fs Free surface instance.
		@param waves Waves instance.
		@param sea Sea boundary instance.
		@param bem Boundary Element Method instance.
		@param body Body instance.
		"""
		# Problem sizes: free surface panels + body panels, wave components.
		nFS = fs['N']
		nB  = body['N']
		n   = nFS + nB
		nW  = waves['N']
		# Host-side arrays for geometry, BEM potentials and wave data.
		pos    = np.zeros((n, 4), dtype=np.float32)
		area   = np.zeros((n), dtype=np.float32)
		normal = np.zeros((n, 4), dtype=np.float32)
		p      = np.zeros((n), dtype=np.float32)
		dp     = np.zeros((n), dtype=np.float32)
		w      = np.zeros((nW, 4), dtype=np.float32)
		# Free-surface entries occupy the first nFS slots.
		pos[0:nFS]    = fs['pos'].reshape((nFS, 4))
		area[0:nFS]   = fs['area'].reshape((nFS))
		normal[0:nFS] = fs['normal'].reshape((nFS, 4))
		nx = fs['Nx']
		ny = fs['Ny']
		p[0:n]  = bem['p']
		dp[0:n] = bem['gradp']
		w[0:nW] = waves['data']
		# Lazily create the device buffers on the first call only.
		if not self.A:
			mf    = cl.mem_flags
			fsize = np.dtype('float32').itemsize
			self.A      = cl.Buffer(self.context, mf.WRITE_ONLY, size=n * n * fsize)
			self.B      = cl.Buffer(self.context, mf.WRITE_ONLY, size=n * fsize)
			self.dB     = cl.Buffer(self.context, mf.WRITE_ONLY, size=n * fsize)
			self.pos    = cl.Buffer(self.context, mf.READ_ONLY, size=n * 4 * fsize)
			self.area   = cl.Buffer(self.context, mf.READ_ONLY, size=n * fsize)
			self.normal = cl.Buffer(self.context, mf.READ_ONLY, size=n * 4 * fsize)
			self.bem_p  = cl.Buffer(self.context, mf.READ_ONLY, size=n * fsize)
			self.bem_dp = cl.Buffer(self.context, mf.READ_ONLY, size=n * fsize)
			self.waves  = cl.Buffer(self.context, mf.READ_ONLY, size=nW * 4 * fsize)
		# Copy every host array to its device buffer and wait for completion.
		for dev_buf, host_arr in ((self.pos, pos), (self.area, area),
		                          (self.normal, normal), (self.bem_p, p),
		                          (self.bem_dp, dp), (self.waves, w)):
			cl.enqueue_write_buffer(self.queue, dev_buf, host_arr).wait()
Пример #40
0
 def to_buf(self, cl_buf, source=None):
     """Copy host data into ``cl_buf`` and block until the transfer ends.

     When ``source`` is omitted, the array registered in ``self.buffers``
     is sent; otherwise ``source`` (or its base array, if it is a view)
     is sent.
     """
     if source is None and cl_buf not in self.buffers:
         raise ValueError(
             'Unknown compute buffer and source not specified.')
     if source is None:
         data = self.buffers[cl_buf]
     elif source.base is not None:
         data = source.base
     else:
         data = source
     cl.enqueue_write_buffer(self.default_queue, cl_buf, data).wait()
Пример #41
0
    def FuseRGBD_GPU(self, depth_image, depth_intrinsic, cam_pose, nu):
        """
        Update the TSDF volume with a new depth frame.

        The depth image is uploaded to the device buffer, but the actual
        fusion currently runs on the CPU (the OpenCL paths below are
        commented out or inside the disabled string block).

        :param depth_image: depth frame to fuse into the volume
        :param depth_intrinsic: camera intrinsics for the frame
        :param cam_pose: camera pose for the frame
        :param nu: TSDF truncation parameter, stored on self
        :return: (TSDF_vertices, Weights_vertices) tuple
        """
        # initialize buffers
        #oneweights = np.ones((self.Size[2],self.Size[1]), dtype=np.int16)
        self.nu = nu
        #self.TSDF = TSDF
        #self.Weight = Weight
        # Non-blocking upload; the CPU fusion below reads depth_image from
        # host memory, so no wait is needed here.
        cl.enqueue_write_buffer(self.GPUManager.queue, self.DepthGPU,
                                depth_image)
        # (translated from Turkish) for now oneweights holds ones; later try
        # this with the skinning weights instead.
        #cl.enqueue_write_buffer(self.GPUManager.queue, self.WeightGPU, oneweights).wait()

        #cl.enqueue_write_buffer(self.GPUManager.queue, self.VertexGPU, self.VertexArray).wait()
        #cl.enqueue_write_buffer(self.GPUManager.queue, self.boneDQGPU, boneDQ)
        #cl.enqueue_write_buffer(self.GPUManager.queue, self.jointDQGPU, jointDQ)

        # (translated from Turkish) sending ravel'ed data now, hence this form
        #print(np.int32(depth_image.shape[1]))
        # fuse data of the RGBD image with the TSDF volume 3D model
        print("==========DEPTH=fuseRGBD========")
        print(depth_image.shape[0])
        print(depth_image.shape[1])
        print(self.Size[1])
        print(self.Size[2])
        print("=======================")

        #self.GPUManager.programs['FuseTSDF'].FuseTSDF(self.GPUManager.queue, (self.Size[1], self.Size[2]) , None, \
        #                        self.TSDFGPU, self.WeightGPU,self.DistGPU,self.DepthGPU, self.Pose_GPU, \
        #                        self.VoxelGPU,self.VertexGPU, \
        #                        self.Calib_GPU, np.int32(depth_shape[0]), np.int32(depth_shape[1]),\
        #                        )
        '''
        self.GPUManager.programs['FuseSortedTSDF'].FuseSortedTSDF(self.GPUManager.queue, (self.Size[2], self.Size[1]) , None, \
                        self.TSDFGPU, self.WeightGPU,self.DistGPU,self.DepthGPU, \
                        self.voxelvertexArrayGPU, np.int32(self.nu), \
                        self.Calib_GPU, np.int32(depth_shape[0]), np.int32(depth_shape[1])\
                        )
        
        #GPU for Sphere Equation
        #self.GPUManager.programs['FuseEqTSDF'].FuseEqTSDF(self.GPUManager.queue, (self.Size[1], self.Size[2]), None, \
        #                        self.TSDFGPU, self.TSDFtableGPU,self.Size_Volume, self.Pose_GPU, \
        #                        self.VoxelGPU,self.VertexGPU, \
        #                        np.int32(depth_shape[0]), np.int32(depth_shape[1]))


        # update CPU array. Read the buffer to write in the CPU array.
        cl.enqueue_read_buffer(self.GPUManager.queue, self.TSDFGPU, self.TSDF).wait()
        cl.enqueue_read_buffer(self.GPUManager.queue, self.DistGPU, self.Dist).wait()
        cl.enqueue_read_buffer(self.GPUManager.queue, self.WeightGPU, self.Weight).wait()
        #cl.enqueue_read_buffer(self.GPUManager.queue, self.DepthGPU, self.depth).wait()
        #cl.enqueue_read_buffer(self.GPUManager.queue, self.VoxelGPU, self.voxeldogrumu).wait()
        '''

        # CPU fusion path: absolute-minimum mode or the ver2 variant.
        # self.calculateTSDF(depth_image, depth_intrinsic, cam_pose)
        if (self.maximum_mode):
            self.calculateTSDF_absminmode(depth_image, depth_intrinsic,
                                          cam_pose)
        else:
            self.calculateTSDF_ver2(depth_image, depth_intrinsic, cam_pose)
        # np.savetxt(savepath + '/FuseRGBD_CGPU.txt', self.TSDF_vertices, delimiter=',')
        # np.savetxt(savepath + '/FuseRGBD_CDist.txt', self.Dist, delimiter=',')

        #from Sphere Equation
        #self.TSDFeq()

        # np.savetxt(savepath + '/FuseRGBD_CGPU.txt', self.TSDF, delimiter=',')
        # np.savetxt(savepath + '/FuseRGBD_CDist.txt', self.Dist, delimiter=',')
        #np.savetxt('FuseRGBD_WeightGG.txt', self.Weight, delimiter=',')
        #np.savetxt('FuseRGBD_cokyanlis.txt', self.depth, delimiter=',')

        # TEST if TSDF contains NaN
        TSDFNaN = np.count_nonzero(np.isnan(self.TSDF_vertices))
        print("TSDFNaN : {0:d}".format(TSDFNaN))

        return self.TSDF_vertices, self.Weights_vertices
Пример #42
0
    def gpu_bench(self, N):
        ctx = cl.Context(devices=[self.dev])
        queue = cl.CommandQueue(ctx)
        mf = cl.mem_flags

        U = numpy.zeros((N, N), numpy.float32)
        V = numpy.zeros((N, N), numpy.float32)

        U[0, ::2] = 1.0
        V[0, 1::2] = 1.0

        U_buf = cl.Buffer(ctx, mf.READ_WRITE, U.nbytes)
        V_buf = cl.Buffer(ctx, mf.READ_WRITE, V.nbytes)

        cl.enqueue_write_buffer(queue, U_buf, U).wait()
        cl.enqueue_write_buffer(queue, V_buf, V).wait()

        knl1 = """
__kernel void update(__global float *u, __global float *v)
{
    int i = get_global_id(0) + 1;
    int j = get_global_id(1) + 1;
    int ny = %(NY)d;
    
    v[ny*j + i] = ((u[ny*(i-1) + j] + u[ny*(i+1) + j]) +
                                   (u[ny*i + j-1] + u[ny*i + j+1]))*0.25;
    
    }
    """ % {
            'NY': N,
            'NX': N
        }

        knl2 = """
__kernel void update(__global float *u, __global float *v)
{
    int i = get_global_id(0);
    int j = get_global_id(1);
    int x = get_local_id(0)+1;
    int y = get_local_id(1)+1;
    int lsize = %(lsize)d;
    int ny = %(NY)d;
    __local float tile[324];
    float sum = 0.0f;
    
    tile[lsize*y + x] = u[ny*j + i];
    
    barrier(CLK_LOCAL_MEM_FENCE);
    
    sum += tile[lsize*(y-1) + x];
    sum += tile[lsize*(y+1) + x];
    sum += tile[lsize*y + x + 1];
    sum += tile[lsize*y + x - 1];
    sum *= 0.25f;
    
    v[ny*j + i] = sum;
    
    }
    """ % {
            'NY': N,
            'NX': N,
            'lsize': 18
        }

        prg = cl.Program(ctx, knl2)
        prg.build()
        yield 0
        reps = 100 if N <= 1000 else 10
        ct = 0
        while True:
            for i in xrange(reps):
                evt = prg.update(queue, ((N), (N)), (16, 16),
                                 U_buf,
                                 V_buf,
                                 g_times_l=False)
                evt = prg.update(queue, ((N), (N)), (16, 16),
                                 V_buf,
                                 U_buf,
                                 wait_for=[evt],
                                 g_times_l=False)
                ct += 1
            queue.finish()
            yield ct
Пример #43
0
	def mining_thread(self):
		"""Main OpenCL mining loop for this device.

		Pulls work units from self.work_queue, binds their midstates to the
		kernel, launches search batches, adapts the batch size toward the
		target frame time, throttles on GPU temperature, and sends found
		nonces to the switch.  Returns only when self.should_stop is set.
		"""
		say_line('started OpenCL miner on platform %d, device %d (%s)', (self.options.platform, self.device_index, self.device_name))

		# Vector kernels hash two nonces per thread, so divisor/nonce-space
		# differ between the two kernel variants.
		(self.defines, rate_divisor, hashspace) = if_else(self.vectors, ('-DVECTORS', 500, 0x7FFFFFFF), ('', 1000, 0xFFFFFFFF))
		self.defines += (' -DOUTPUT_SIZE=' + str(self.output_size))
		self.defines += (' -DOUTPUT_MASK=' + str(self.output_size - 1))

		self.load_kernel()
		frame = 1.0 / max(self.frames, 3)
		unit = self.worksize * 256
		global_threads = unit * 10

		queue = cl.CommandQueue(self.context)

		last_rated_pace = last_rated = last_n_time = last_temperature = time()
		base = last_hash_rate = threads_run_pace = threads_run = 0
		# Extra final slot in `output` is the "nonce found" flag.
		output = np.zeros(self.output_size + 1, np.uint32)
		output_buffer = cl.Buffer(self.context, cl.mem_flags.WRITE_ONLY | cl.mem_flags.USE_HOST_PTR, hostbuf=output)
		self.kernel.set_arg(20, output_buffer)

		work = None
		temperature = 0
		while True:
			if self.should_stop: return

			sleep(self.frameSleep)

			# Fetch fresh work when idle or when new work was queued.
			if (not work) or (not self.work_queue.empty()):
				try:
					work = self.work_queue.get(True, 1)
				except Empty: continue
				else:
					if not work: continue
					nonces_left = hashspace
					state = work.state
					state2 = work.state2
					f = work.f

					# Args 0-7: first midstate words.
					self.kernel.set_arg(0, state[0])
					self.kernel.set_arg(1, state[1])
					self.kernel.set_arg(2, state[2])
					self.kernel.set_arg(3, state[3])
					self.kernel.set_arg(4, state[4])
					self.kernel.set_arg(5, state[5])
					self.kernel.set_arg(6, state[6])
					self.kernel.set_arg(7, state[7])

					# Args 8-13: second-round midstate words (0 and 4 are
					# recomputed in the kernel and not passed).
					self.kernel.set_arg(8, state2[1])
					self.kernel.set_arg(9, state2[2])
					self.kernel.set_arg(10, state2[3])
					self.kernel.set_arg(11, state2[5])
					self.kernel.set_arg(12, state2[6])
					self.kernel.set_arg(13, state2[7])

					# Args 15-19: precomputed round constants.
					self.kernel.set_arg(15, f[0])
					self.kernel.set_arg(16, f[1])
					self.kernel.set_arg(17, f[2])
					self.kernel.set_arg(18, f[3])
					self.kernel.set_arg(19, f[4])

			# Launch a batch unless the GPU is too hot; arg 14 is the nonce
			# base for this batch.
			if temperature < self.cutoff_temp:
				self.kernel.set_arg(14, pack('I', base))
				cl.enqueue_nd_range_kernel(queue, self.kernel, (global_threads,), (self.worksize,))

				nonces_left -= global_threads
				threads_run_pace += global_threads
				threads_run += global_threads
				base = uint32(base + global_threads)
			else:
				threads_run_pace = 0
				last_rated_pace = time()
				sleep(self.cutoff_interval)

			now = time()
			# Poll the temperature at most once per second via ADL.
			if self.adapterIndex != None:
				t = now - last_temperature
				if temperature >= self.cutoff_temp or t > 1:
					last_temperature = now
					with adl_lock:
						temperature = self.get_temperature()

			# Resize the batch when the measured rate drifts more than 10%.
			t = now - last_rated_pace
			if t > 1:
				rate = (threads_run_pace / t) / rate_divisor
				last_rated_pace = now; threads_run_pace = 0
				r = last_hash_rate / rate
				if r < 0.9 or r > 1.1:
					global_threads = max(unit * int((rate * frame * rate_divisor) / unit), unit)
					last_hash_rate = rate

			t = now - last_rated
			if t > self.options.rate:
				self.update_rate(now, threads_run, t, work.targetQ, rate_divisor)
				last_rated = now; threads_run = 0

			queue.finish()
			cl.enqueue_read_buffer(queue, output_buffer, output)
			queue.finish()

			# A nonzero flag slot means the kernel found candidate nonces.
			if output[self.output_size]:
				result = Object()
				result.header = work.header
				result.merkle_end = work.merkle_end
				result.time = work.time
				result.difficulty = work.difficulty
				result.target = work.target
				result.state = np.array(state)
				result.nonces = np.array(output)
				result.job_id = work.job_id
				result.extranonce2 = work.extranonce2
				result.server = work.server
				result.miner = self
				self.switch.put(result)
				output.fill(0)
				cl.enqueue_write_buffer(queue, output_buffer, output)

			if not self.switch.update_time:
				if nonces_left < 3 * global_threads * self.frames:
					self.update = True
					nonces_left += 0xFFFFFFFFFFFF
				elif 0xFFFFFFFFFFF < nonces_left < 0xFFFFFFFFFFFF:
					say_line('warning: job finished, %s is idle', self.id()) 
					work = None
			elif now - last_n_time > 1:
				# Roll ntime forward once per second and rebind midstates.
				work.time = bytereverse(bytereverse(work.time) + 1)
				state2 = partial(state, work.merkle_end, work.time, work.difficulty, f)
				calculateF(state, work.merkle_end, work.time, work.difficulty, f, state2)
				self.kernel.set_arg(8, state2[1])
				self.kernel.set_arg(9, state2[2])
				self.kernel.set_arg(10, state2[3])
				self.kernel.set_arg(11, state2[5])
				self.kernel.set_arg(12, state2[6])
				self.kernel.set_arg(13, state2[7])
				self.kernel.set_arg(15, f[0])
				self.kernel.set_arg(16, f[1])
				self.kernel.set_arg(17, f[2])
				self.kernel.set_arg(18, f[3])
				self.kernel.set_arg(19, f[4])
				last_n_time = now
				self.update_time_counter += 1
				if self.update_time_counter >= self.switch.max_update_time:
					self.update = True
					self.update_time_counter = 1
Пример #44
0
 def load_images(self, rgb, depth):
     """Upload the RGB and depth frames to their device buffers, blocking
     until both transfers finish."""
     for dev_buf, host_image in ((self.rgb_cl, rgb), (self.depth_cl, depth)):
         cl.enqueue_write_buffer(self.queue, dev_buf, host_image)
     self.queue.finish()
Пример #45
0
class Array(object):
    """A :mod:`pyopencl` Array is used to do array-based calculation on
    a compute device.

    This is mostly supposed to be a :mod:`numpy`-workalike. Operators
    work on an element-by-element basis, just like :class:`numpy.ndarray`.
    """

    def __init__(self, context, shape, dtype, order="C", allocator=None,
            base=None, data=None, queue=None):
        """Create an array of `shape` and `dtype` on `context`.

        :param shape: iterable of dims, or a single integer extent
        :param allocator: callable(nbytes) -> device buffer; defaults to
            DefaultAllocator(context)
        :param base: parent array this one is a view of; only valid
            together with `data`
        :param data: existing device buffer to wrap instead of allocating
        """
        if allocator is None:
            allocator = DefaultAllocator(context)

        # Accept either an iterable shape or a single integer extent.
        try:
            s = 1
            for dim in shape:
                s *= dim
        except TypeError:
            if not isinstance(shape, int):
                raise TypeError("shape must either be iterable or "
                        "castable to an integer")
            s = shape
            shape = (shape,)

        self.context = context
        self.queue = queue

        self.shape = shape
        self.dtype = numpy.dtype(dtype)
        if order not in ["C", "F"]:
            raise ValueError("order must be either 'C' or 'F'")
        self.order = order

        self.mem_size = self.size = s
        self.nbytes = self.dtype.itemsize * self.size

        self.allocator = allocator
        if data is None:
            if self.size:
                self.data = self.allocator(self.size * self.dtype.itemsize)
            else:
                self.data = None

            # BUG FIX: this guard fires when `base` is given WITHOUT `data`;
            # the old message stated the inverted condition.
            if base is not None:
                raise ValueError(
                        "base must not be specified when data is not given")
        else:
            self.data = data

        self.base = base

    #@memoize_method FIXME: reenable
    def get_sizes(self, queue):
        """Return the (global, local) work sizes splay chose for `queue`."""
        return splay(queue, self.mem_size)

    def set(self, ary, queue=None, async_=False):
        """Copy the host array `ary` into this array's device storage.

        BUG FIX: the parameter was named ``async``, which is a reserved
        keyword since Python 3.7 and made this file unparseable; renamed
        to ``async_`` (matching upstream pyopencl).  When false, block
        until the transfer completes.
        """
        assert ary.size == self.size
        assert ary.dtype == self.dtype
        if self.size:
            evt = cl.enqueue_write_buffer(queue or self.queue, self.data, ary)

            if not async_:
                evt.wait()
Пример #46
0
def load_raw(depth):
    """Queue a non-blocking upload of the float32 ``depth`` crop into
    raw_buf; returns the transfer event."""
    (left, top), (right, bottom) = rect
    assert depth.dtype == np.float32
    assert depth.shape[0] == bottom - top
    assert depth.shape[1] == right - left
    return cl.enqueue_write_buffer(queue, raw_buf, depth, is_blocking=False)
Пример #47
0
def load_mask(mask):
    """Queue a non-blocking upload of the uint8 ``mask`` crop into
    mask_buf; returns the transfer event."""
    (left, top), (right, bottom) = rect
    assert mask.dtype == np.uint8
    assert mask.shape[0] == bottom - top
    assert mask.shape[1] == right - left
    return cl.enqueue_write_buffer(queue, mask_buf, mask, is_blocking=False)
Пример #48
0
    #print f2
    #print ftime[f1]
    #print ftime[f2]

    # read new frames and push to kernel
    # note since the data in f2 becomes f1 we do not need to push two frames
    # every time we go to a new interval.  By using an additional flag we can
    # set the linear interpolation in the kernel to be correct whether or not
    # f1 is behind or in front of f2
    if (f1 != flast):
        flast = f1
        # Slice the u/v velocity fields at the two bracketing time indices.
        # NOTE(review): assumes fin.var(...) is a netCDF-style variable
        # accessor -- confirm against the file-open code above.
        uf1 = fin.var('u')[f1, :]
        vf1 = fin.var('v')[f1, :]
        uf2 = fin.var('u')[f2, :]
        vf2 = fin.var('v')[f2, :]
        # Blocking uploads: the kernels below must see the new frames.
        cl.enqueue_write_buffer(a_queue, uf1_buf, uf1).wait()
        cl.enqueue_write_buffer(a_queue, vf1_buf, vf1).wait()
        cl.enqueue_write_buffer(a_queue, uf2_buf, uf2).wait()
        cl.enqueue_write_buffer(a_queue, vf2_buf, vf2).wait()

    #-----------------------------------------------------------------------
    # find cell containing particle and update state
    #-----------------------------------------------------------------------
    event = findcell_knl(a_queue, x.shape, None, cell_buf, neney_buf, eney_buf,
                         x_buf, y_buf, xt_buf, yt_buf)

    #cl.enqueue_read_buffer(a_queue, cell_buf, cell).wait()

    # Second pass: robust lookup for particles the fast search missed.
    # NOTE(review): inferred from the kernel name -- confirm in the .cl file.
    event = findrobust_knl(a_queue, x.shape, None, cell_buf, x_buf, y_buf,
                           xt_buf, yt_buf, num_elems)
    def run_with_ocl(np_in_list,
                     np_out_list,
                     shape,
                     oclfun,
                     preset_outbuf=False,
                     rw_outbuf=False):
        """Run kernel ``oclfun`` over ``shape``, wiring numpy arrays to
        device buffers.

        Input arrays are copied in via COPY_HOST_PTR buffers; results are
        read back into ``np_out_list`` in place.

        :param preset_outbuf: also upload the current contents of the
            output arrays before the launch (for read-modify-write kernels)
        :param rw_outbuf: make the output buffers READ_WRITE instead of
            WRITE_ONLY
        """
        assert (DEFAULT_ENABLE_OPENCL == True and OCL.ENABLE_OPENCL == True)
        t_start = time.time()

        # One read-only device buffer per input, initialized from the host.
        buf_in_list = [
            cl.Buffer(OCL.prog['ctx'],
                      OCL.prog['mf'].READ_ONLY | OCL.prog['mf'].COPY_HOST_PTR,
                      size=np_in.nbytes,
                      hostbuf=np_in) for np_in in np_in_list
        ]

        if preset_outbuf:
            # Output buffers seeded with the current host values.
            buf_out_list = [
                cl.Buffer(OCL.prog['ctx'],
                          (OCL.prog['mf'].READ_WRITE
                           if rw_outbuf else OCL.prog['mf'].WRITE_ONLY)
                          | OCL.prog['mf'].USE_HOST_PTR,
                          size=np_out.nbytes,
                          hostbuf=np_out) for np_out in np_out_list
            ]

            for i, outbuf in enumerate(buf_out_list):
                cl.enqueue_write_buffer(OCL.prog['queue'], outbuf,
                                        np_out_list[i])
        else:
            # Uninitialized output buffers allocated in host-visible memory.
            buf_out_list = [
                cl.Buffer(OCL.prog['ctx'],
                          (OCL.prog['mf'].READ_WRITE
                           if rw_outbuf else OCL.prog['mf'].WRITE_ONLY)
                          | OCL.prog['mf'].ALLOC_HOST_PTR,
                          size=np_out.nbytes) for np_out in np_out_list
            ]

        # Launch with inputs first, then outputs, as positional kernel args.
        runevent = oclfun(
            OCL.prog['queue'], shape, None,
            *(buf_in_list +
              buf_out_list))  # queue, globalSize, localSize, *buffers)
        runevent.wait()

        # Read every output buffer back into its host array, in place.
        for buf_from, np_to in zip(buf_out_list, np_out_list):
            # cl.enqueue_copy(OCL.prog['queue'], np_to, buf_from)

            cl.enqueue_read_buffer(OCL.prog['queue'], buf_from, np_to)

            # np_to1, e = cl.enqueue_map_buffer(
            #     OCL.prog['queue'],
            #     buf_from,
            #     cl.map_flags.READ,
            #     0,
            #     np_to.shape,
            #     np_to.dtype,
            #     "C"
            # )
            # np_to += np_to1

        OCL.prog['queue'].finish()

        t_end = time.time()
        dt_real = t_end - t_start
Пример #50
0
    def miningThread(self):
        """Per-device mining loop (Python 2 era).

        Fetches work from self.workQueue, unpacks the hex-encoded fields,
        launches search batches, adapts the batch size toward the target
        frame time, and posts found nonces to self.resultQueue.  Returns
        when self.stop is set.
        """
        self.loadKernel()
        frame = 1.0 / self.frames
        unit = self.worksize * 256
        globalThreads = unit * 10

        queue = cl.CommandQueue(self.context)

        lastRatedPace = lastRated = lastNTime = time()
        base = lastHashRate = threadsRunPace = threadsRun = 0
        f = np.zeros(8, np.uint32)
        # Last slot of `output` is the "nonce found" flag.
        output = np.zeros(OUTPUT_SIZE + 1, np.uint32)
        output_buf = cl.Buffer(self.context,
                               cl.mem_flags.WRITE_ONLY
                               | cl.mem_flags.USE_HOST_PTR,
                               hostbuf=output)

        work = None
        while True:
            sleep(self.frameSleep)
            if self.stop: return
            # Pick up fresh work when idle or when new work was queued.
            if (not work) or (not self.workQueue.empty()):
                try:
                    work = self.workQueue.get(True, 1)
                except Empty:
                    continue
                else:
                    if not work: continue

                    noncesLeft = self.hashspace
                    # Hex-decode the getwork fields into uint32 arrays.
                    data = np.array(unpack('IIIIIIIIIIIIIIII',
                                           work['data'][128:].decode('hex')),
                                    dtype=np.uint32)
                    state = np.array(unpack('IIIIIIII',
                                            work['midstate'].decode('hex')),
                                     dtype=np.uint32)
                    target = np.array(unpack('IIIIIIII',
                                             work['target'].decode('hex')),
                                      dtype=np.uint32)
                    state2 = partial(state, data, f)

            # One search batch of globalThreads nonces starting at `base`.
            self.miner.search(queue, (globalThreads, ), (self.worksize, ),
                              state[0], state[1], state[2], state[3], state[4],
                              state[5], state[6], state[7], state2[1],
                              state2[2],
                              state2[3], state2[5], state2[6], state2[7],
                              pack('I', base), f[0], f[1], f[2], f[3], f[4],
                              f[5], f[6], f[7], output_buf)
            cl.enqueue_read_buffer(queue, output_buf, output)

            noncesLeft -= globalThreads
            threadsRunPace += globalThreads
            threadsRun += globalThreads
            base = uint32(base + globalThreads)

            now = time()
            # Resize the batch when the measured rate drifts more than 10%.
            t = now - lastRatedPace
            if (t > 1):
                rate = (threadsRunPace / t) / self.rateDivisor
                lastRatedPace = now
                threadsRunPace = 0
                r = lastHashRate / rate
                if r < 0.9 or r > 1.1:
                    globalThreads = max(
                        unit * int((rate * frame * self.rateDivisor) / unit),
                        unit)
                    lastHashRate = rate
            t = now - lastRated
            if (t > self.rate):
                self.hashrate(int((threadsRun / t) / self.rateDivisor))
                lastRated = now
                threadsRun = 0

            queue.finish()

            # Nonzero flag slot: the kernel wrote candidate nonces.
            if output[OUTPUT_SIZE]:
                result = {}
                result['work'] = work
                result['data'] = np.array(data)
                result['state'] = np.array(state)
                result['target'] = target
                result['output'] = np.array(output)
                self.resultQueue.put(result)
                output.fill(0)
                cl.enqueue_write_buffer(queue, output_buf, output)

            if self.updateTime == '':
                if noncesLeft < (TIMEOUT + 1) * globalThreads * self.frames:
                    self.update = True
                    noncesLeft += 0xFFFFFFFFFFFF
                elif 0xFFFFFFFFFFF < noncesLeft < 0xFFFFFFFFFFFF:
                    self.sayLine('warning: job finished, miner is idle')
                    work = None
            elif now - lastNTime > 1:
                # Roll ntime forward once per second and refresh state2.
                data[1] = bytereverse(bytereverse(data[1]) + 1)
                state2 = partial(state, data, f)
                lastNTime = now
Пример #51
0
    def run(self):
        """Mining loop with frame-time based batch sizing.

        Grows/shrinks globalThreads so each kernel launch lands inside a
        +/- 1/30 window around the target frame duration; posts results to
        self.resultQueue and returns when a 'stop' work item arrives.
        """
        frame = float(1) / float(self.frames)
        window = frame / 30
        upper = frame + window
        lower = frame - window

        unit = self.worksize * 256
        globalThreads = unit

        queue = cl.CommandQueue(self.context)

        base = lastRate = threadsRun = lastNTime = 0
        # output[0] holds the found hash, output[1] the nonce.
        output = np.zeros(2, np.uint32)
        output_buf = cl.Buffer(self.context,
                               cl.mem_flags.WRITE_ONLY
                               | cl.mem_flags.USE_HOST_PTR,
                               hostbuf=output)

        work = None
        while True:
            if (not work) or (not self.workQueue.empty()):
                try:
                    work = self.workQueue.get(True, 1)
                except Empty:
                    continue
                else:
                    if not work:
                        continue
                    elif work == 'stop':
                        return
                    try:
                        # Hex-decode the getwork fields into uint32 arrays.
                        data = np.array(unpack(
                            'IIIIIIIIIIIIIIII',
                            work['data'][128:].decode('hex')),
                                        dtype=np.uint32)
                        state = np.array(unpack(
                            'IIIIIIII', work['midstate'].decode('hex')),
                                         dtype=np.uint32)
                        target = np.array(unpack('IIIIIIII',
                                                 work['target'].decode('hex')),
                                          dtype=np.uint32)
                        state2 = partial(state, data)
                    except Exception as e:
                        self.sayLine('Wrong data format from RPC!')
                        sys.exit()

            # Time one batch to steer the batch-size feedback loop below.
            kernelStart = time()
            self.miner.search(queue, (globalThreads, ), (self.worksize, ),
                              data[0], data[1], data[2], state[0], state[1],
                              state[2], state[3], state[4], state[5], state[6],
                              state[7], state2[1], state2[2], state2[3],
                              state2[5], state2[6], state2[7], target[6],
                              target[7], pack('I', base), output_buf)
            cl.enqueue_read_buffer(queue, output_buf, output)

            if (time() - lastRate > self.rate):
                self.say(
                    '%s khash/s',
                    int((threadsRun / (time() - lastRate)) / self.rateDivisor))
                threadsRun = 0
                lastRate = time()

            queue.finish()
            kernelTime = time() - kernelStart

            threadsRun += globalThreads
            base = uint32(base + globalThreads)

            # Steer the batch size toward the target frame window.
            if (kernelTime < lower):
                globalThreads += unit
            elif (kernelTime > upper and globalThreads > unit):
                globalThreads -= unit

            # A nonzero hash slot means a share was found: splice the nonce
            # back into the hex data block and post it.
            if output[0]:
                result = {}
                d = work['data']
                d = d[:136] + pack('I', long(
                    data[1])).encode('hex') + d[144:152] + pack(
                        'I', long(output[1])).encode('hex') + d[160:]
                result['data'] = d
                result['hash'] = output[0]
                self.resultQueue.put(result)
                output[0] = 0
                cl.enqueue_write_buffer(queue, output_buf, output)
                work = None
                continue

            # Roll ntime forward once per second and refresh state2.
            if (time() - lastNTime > 1):
                data[1] = bytereverse(bytereverse(data[1]) + 1)
                state2 = partial(state, data)
                lastNTime = time()
Пример #52
0
import pyopencl as cl, numpy
import numpy.linalg as la

# Host data: sixteen floats 0..15, to be doubled on the device.
host_data = numpy.arange(16).astype(numpy.float32)

context = cl.create_some_context()
cmd_queue = cl.CommandQueue(context)

# Device buffer sized to the host array, filled by an explicit write.
dev_buf = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=host_data.nbytes)
cl.enqueue_write_buffer(cmd_queue, dev_buf, host_data)

# Kernel: each work-item doubles its own element and prints a marker
# (printf needs a vendor extension on older OpenCL implementations).
program = cl.Program(
    context, """
#pragma OPENCL EXTENSION cl_intel_printf : enable
#pragma OPENCL EXTENSION cl_amd_printf : enable

__kernel void twice(__global float *a) {
    int i = get_local_id(0);
    int local_size = get_local_size(0);
    int group_id = get_group_id(0);
    a[i + local_size * group_id] *= 2;
    //a[i] = float(local_size);
    printf(" -> \n");
}
""").build()

# Launch with work-groups of 4 over the whole array.
program.twice(cmd_queue, host_data.shape, (4, ), dev_buf)

result = numpy.empty_like(host_data)
cl.enqueue_read_buffer(cmd_queue, dev_buf, result).wait()
Пример #53
0
 def to_buf_async(self, cl_buf, stream=None):
     """Start a non-blocking host-to-device copy of the host array that
     backs *cl_buf* (looked up in self.buffers).

     Uses *stream*'s queue when given, otherwise self.default_queue.
     """
     if stream is not None:
         target_queue = stream.queue
     else:
         target_queue = self.default_queue
     cl.enqueue_write_buffer(target_queue, cl_buf, self.buffers[cl_buf],
                             is_blocking=False)
Пример #54
0
    def process_sub_matrix(self, *args, **kwargs):
        """Worker loop: drain ``sub_matrix_queue`` and process each sub matrix
        on one OpenCL device.

        For every sub matrix this (in strict order): uploads the input
        vectors and carryover state, runs the 'vertical' kernel, runs the
        diagonal kernel, reads the results back, and accumulates the
        frequency distributions and per-phase event runtimes for the device.
        The loop exits when the queue is empty.

        Expected kwargs:
            device           -- key into the self.opencl per-device containers
            sub_matrix_queue -- Queue of sub-matrix work items
        """
        device = kwargs['device']
        sub_matrix_queue = kwargs['sub_matrix_queue']

        # Per-device OpenCL handles, created elsewhere.
        context = self.opencl.contexts[device]
        command_queue = self.opencl.command_queues[device]
        program = self.opencl.programs[device]

        vertical_kernel = cl.Kernel(program, 'vertical')
        diagonal_kernel = cl.Kernel(program,
                                    self.settings.diagonal_kernel_name)

        while True:
            try:
                # Non-blocking get: Queue.Empty below terminates the worker.
                sub_matrix = sub_matrix_queue.get(False)

                # Event lists per phase, converted to runtimes at the end.
                transfer_from_device_events = []
                transfer_to_device_events = []
                create_matrix_events = []
                vertical_events = []
                diagonal_events = []

                # Vectors X
                vectors_x = self.get_vectors_x(sub_matrix)

                vectors_x_buffer = cl.Buffer(
                    context, cl.mem_flags.READ_ONLY,
                    vectors_x.size * vectors_x.itemsize)

                transfer_to_device_events.append(
                    cl.enqueue_write_buffer(command_queue,
                                            vectors_x_buffer,
                                            vectors_x,
                                            device_offset=0,
                                            wait_for=None,
                                            is_blocking=False))

                # Vectors Y
                vectors_y = self.get_vectors_y(sub_matrix)

                vectors_y_buffer = cl.Buffer(
                    context, cl.mem_flags.READ_ONLY,
                    vectors_y.size * vectors_y.itemsize)

                transfer_to_device_events.append(
                    cl.enqueue_write_buffer(command_queue,
                                            vectors_y_buffer,
                                            vectors_y,
                                            device_offset=0,
                                            wait_for=None,
                                            is_blocking=False))

                # Recurrence points
                recurrence_points, \
                    recurrence_points_start, \
                    recurrence_points_end = self.get_recurrence_points(sub_matrix)

                recurrence_points_buffer = cl.Buffer(
                    context, cl.mem_flags.READ_WRITE,
                    recurrence_points.size * recurrence_points.itemsize)

                transfer_to_device_events.append(
                    cl.enqueue_write_buffer(command_queue,
                                            recurrence_points_buffer,
                                            recurrence_points,
                                            device_offset=0,
                                            wait_for=None,
                                            is_blocking=False))

                # Vertical frequency distribution
                vertical_frequency_distribution = self.get_empty_local_frequency_distribution(
                )

                vertical_frequency_distribution_buffer = cl.Buffer(
                    context, cl.mem_flags.READ_WRITE,
                    vertical_frequency_distribution.size *
                    vertical_frequency_distribution.itemsize)

                transfer_to_device_events.append(
                    cl.enqueue_write_buffer(
                        command_queue,
                        vertical_frequency_distribution_buffer,
                        vertical_frequency_distribution,
                        device_offset=0,
                        wait_for=None,
                        is_blocking=False))

                # White vertical frequency distribution
                white_vertical_frequency_distribution = self.get_empty_local_frequency_distribution(
                )

                white_vertical_frequency_distribution_buffer = cl.Buffer(
                    context, cl.mem_flags.READ_WRITE,
                    white_vertical_frequency_distribution.size *
                    white_vertical_frequency_distribution.itemsize)

                transfer_to_device_events.append(
                    cl.enqueue_write_buffer(
                        command_queue,
                        white_vertical_frequency_distribution_buffer,
                        white_vertical_frequency_distribution,
                        device_offset=0,
                        wait_for=None,
                        is_blocking=False))

                # Diagonal frequency distribution
                diagonal_frequency_distribution = self.get_empty_local_frequency_distribution(
                )

                diagonal_frequency_distribution_buffer = cl.Buffer(
                    context, cl.mem_flags.READ_WRITE,
                    diagonal_frequency_distribution.size *
                    diagonal_frequency_distribution.itemsize)

                transfer_to_device_events.append(
                    cl.enqueue_write_buffer(
                        command_queue,
                        diagonal_frequency_distribution_buffer,
                        diagonal_frequency_distribution,
                        device_offset=0,
                        wait_for=None,
                        is_blocking=False))

                # Vertical carryover (line-length state spanning sub matrices)
                vertical_carryover, \
                    vertical_carryover_start,\
                    vertical_carryover_end = self.get_vertical_length_carryover(sub_matrix)

                vertical_carryover_buffer = cl.Buffer(
                    context, cl.mem_flags.READ_WRITE,
                    vertical_carryover.size * vertical_carryover.itemsize)

                transfer_to_device_events.append(
                    cl.enqueue_write_buffer(command_queue,
                                            vertical_carryover_buffer,
                                            vertical_carryover,
                                            device_offset=0,
                                            wait_for=None,
                                            is_blocking=False))

                # White vertical carryover
                white_vertical_carryover, \
                    white_vertical_carryover_start,\
                    white_vertical_carryover_end = self.get_white_vertical_length_carryover(sub_matrix)

                white_vertical_carryover_buffer = cl.Buffer(
                    context, cl.mem_flags.READ_WRITE,
                    white_vertical_carryover.size *
                    white_vertical_carryover.itemsize)

                transfer_to_device_events.append(
                    cl.enqueue_write_buffer(command_queue,
                                            white_vertical_carryover_buffer,
                                            white_vertical_carryover,
                                            device_offset=0,
                                            wait_for=None,
                                            is_blocking=False))

                # Diagonal carryover
                diagonal_carryover, \
                    diagonal_carryover_start, \
                    diagonal_carryover_end = self.get_diagonal_length_carryover(sub_matrix)

                diagonal_carryover_buffer = cl.Buffer(
                    context, cl.mem_flags.READ_WRITE,
                    diagonal_carryover.size * diagonal_carryover.itemsize)

                transfer_to_device_events.append(
                    cl.enqueue_write_buffer(command_queue,
                                            diagonal_carryover_buffer,
                                            diagonal_carryover,
                                            device_offset=0,
                                            wait_for=None,
                                            is_blocking=False))

                # Barrier: all uploads must land before kernels launch.
                command_queue.finish()

                # Vertical kernel
                vertical_args = [
                    vectors_x_buffer, vectors_y_buffer,
                    np.uint32(sub_matrix.dim_x),
                    np.uint32(sub_matrix.dim_y),
                    np.uint32(self.settings.embedding_dimension),
                    np.float32(self.settings.neighbourhood.radius),
                    recurrence_points_buffer,
                    vertical_frequency_distribution_buffer,
                    vertical_carryover_buffer,
                    white_vertical_frequency_distribution_buffer,
                    white_vertical_carryover_buffer
                ]

                OpenCL.set_kernel_args(vertical_kernel, vertical_args)

                # Round the global size up to a multiple of the device's
                # max work-group size.
                global_work_size = [
                    int(sub_matrix.dim_x +
                        (device.max_work_group_size -
                         (sub_matrix.dim_x % device.max_work_group_size)))
                ]
                local_work_size = None

                vertical_events.append(
                    cl.enqueue_nd_range_kernel(command_queue, vertical_kernel,
                                               global_work_size,
                                               local_work_size))

                command_queue.finish()

                # Diagonal kernel
                if self.settings.is_matrix_symmetric:
                    diagonal_args = [
                        vectors_x_buffer, vectors_y_buffer,
                        np.uint32(sub_matrix.dim_x),
                        np.uint32(sub_matrix.dim_y),
                        np.uint32(sub_matrix.start_x),
                        np.uint32(sub_matrix.start_y),
                        np.uint32(self.settings.embedding_dimension),
                        np.float32(self.settings.neighbourhood.radius),
                        np.uint32(self.settings.theiler_corrector),
                        np.uint32(self.get_diagonal_offset(sub_matrix)),
                        diagonal_frequency_distribution_buffer,
                        diagonal_carryover_buffer
                    ]

                    global_work_size = [
                        int(sub_matrix.dim_x +
                            (device.max_work_group_size -
                             (sub_matrix.dim_x % device.max_work_group_size)))
                    ]

                else:
                    # Asymmetric case: one work item per diagonal,
                    # dim_x + dim_y - 1 diagonals in total.
                    diagonal_args = [
                        vectors_x_buffer, vectors_y_buffer,
                        np.uint32(sub_matrix.dim_x + sub_matrix.dim_y - 1),
                        np.uint32(sub_matrix.dim_y),
                        np.uint32(sub_matrix.start_x),
                        np.uint32(sub_matrix.start_y),
                        np.uint32(self.settings.embedding_dimension),
                        np.float32(self.settings.neighbourhood.radius),
                        np.uint32(self.settings.theiler_corrector),
                        diagonal_frequency_distribution_buffer,
                        diagonal_carryover_buffer
                    ]

                    global_work_size_x = sub_matrix.dim_x + sub_matrix.dim_y - 1
                    global_work_size = [
                        int(global_work_size_x + (
                            device.max_work_group_size -
                            (global_work_size_x % device.max_work_group_size)))
                    ]

                OpenCL.set_kernel_args(diagonal_kernel, diagonal_args)

                local_work_size = None

                diagonal_events.append(
                    cl.enqueue_nd_range_kernel(command_queue, diagonal_kernel,
                                               global_work_size,
                                               local_work_size))

                command_queue.finish()

                # Read buffer: results and carryover slices go back into the
                # shared host-side arrays at this sub matrix's offsets.
                transfer_from_device_events.append(
                    cl.enqueue_read_buffer(
                        command_queue,
                        recurrence_points_buffer,
                        self.recurrence_points[
                            recurrence_points_start:recurrence_points_end],
                        device_offset=0,
                        wait_for=None,
                        is_blocking=False))

                transfer_from_device_events.append(
                    cl.enqueue_read_buffer(
                        command_queue,
                        vertical_frequency_distribution_buffer,
                        vertical_frequency_distribution,
                        device_offset=0,
                        wait_for=None,
                        is_blocking=False))

                transfer_from_device_events.append(
                    cl.enqueue_read_buffer(
                        command_queue,
                        vertical_carryover_buffer,
                        self.vertical_length_carryover[
                            vertical_carryover_start:vertical_carryover_end],
                        device_offset=0,
                        wait_for=None,
                        is_blocking=False))

                transfer_from_device_events.append(
                    cl.enqueue_read_buffer(
                        command_queue,
                        white_vertical_frequency_distribution_buffer,
                        white_vertical_frequency_distribution,
                        device_offset=0,
                        wait_for=None,
                        is_blocking=False))

                transfer_from_device_events.append(
                    cl.enqueue_read_buffer(
                        command_queue,
                        white_vertical_carryover_buffer,
                        self.white_vertical_length_carryover[
                            white_vertical_carryover_start:
                            white_vertical_carryover_end],
                        device_offset=0,
                        wait_for=None,
                        is_blocking=False))

                transfer_from_device_events.append(
                    cl.enqueue_read_buffer(
                        command_queue,
                        diagonal_frequency_distribution_buffer,
                        diagonal_frequency_distribution,
                        device_offset=0,
                        wait_for=None,
                        is_blocking=False))

                transfer_from_device_events.append(
                    cl.enqueue_read_buffer(
                        command_queue,
                        diagonal_carryover_buffer,
                        self.diagonal_length_carryover[
                            diagonal_carryover_start:diagonal_carryover_end],
                        device_offset=0,
                        wait_for=None,
                        is_blocking=False))

                # Barrier: reads must complete before the host accumulates.
                command_queue.finish()

                # Update frequency distributions
                # NOTE(review): per-device accumulators; presumably one worker
                # per device, so no lock is taken here -- confirm with caller.
                self.threads_vertical_frequency_distribution[
                    device] += vertical_frequency_distribution
                self.threads_white_vertical_frequency_distribution[
                    device] += white_vertical_frequency_distribution
                self.threads_diagonal_frequency_distribution[
                    device] += diagonal_frequency_distribution

                # Get events runtimes
                runtimes = Runtimes()
                runtimes.transfer_to_device = self.opencl.convert_events_runtime(
                    transfer_to_device_events)
                runtimes.transfer_from_device = self.opencl.convert_events_runtime(
                    transfer_from_device_events)
                runtimes.create_matrix = self.opencl.convert_events_runtime(
                    create_matrix_events)
                runtimes.detect_vertical_lines = self.opencl.convert_events_runtime(
                    vertical_events)
                runtimes.detect_diagonal_lines = self.opencl.convert_events_runtime(
                    diagonal_events)

                self.threads_runtimes[device] += runtimes

            except Queue.Empty:
                break
Пример #55
0
'''

# Transpose benchmark setup: create a context/queue, choose a work-group
# size, and stage an m x n float32 matrix on the device.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

m = 2**13
n = 2**13

#workgroup_size = None
# Slow
# workgroup_size = (1,1)
workgroup_size = (2**4, 2**4)  # 16x16 work items per group

a = np.arange(m * n, dtype=np.float32).reshape((m, n))
a_buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, size=a.nbytes)
cl.enqueue_write_buffer(queue, a_buf, a)

prg = cl.Program(
    ctx, """
__kernel
void transpose(
    __global float *a,
    __global float *a_t ){

    int m = get_global_size(0);
    int n = get_global_size(1);

    int i = get_global_id (0);
    int j = get_global_id (1);

    int read_idx = j + i * n;
Пример #56
0
 def write_array(self, data, **kwargs):
     """Enqueue a copy of *data* into this array's device buffer on the
     current device's queue; returns the resulting event.
     """
     return cl.enqueue_write_buffer(get_device().queue, self.data, data,
                                    **kwargs)
Пример #57
0
        # note since the data in f2 becomes f1 we do not need to push two frames
        # every time we go to a new interval.  By using an additional flag we can
        # set the linear interpolation in the kernel to be correct whether or not
        # f1 is behind or in front of f2
        if (f1 != flast):
            flast = f1
            #uf1_buf = uf2_buf
            #vf1_buf = vf2_buf

            uf2 = fin.var('ua')[f2, :]
            vf2 = fin.var('va')[f2, :]

            if (behind[0] == 1):

                event = cl.enqueue_write_buffer(a_queue,
                                                uf2_buf,
                                                uf2,
                                                is_blocking=False)
                #event.wait()
                #elap_wf2 = elap_wf2 + 1e-9*(event.profile.end - event.profile.start)

                event = cl.enqueue_write_buffer(b_queue,
                                                vf2_buf,
                                                vf2,
                                                is_blocking=False)
                #event.wait()
                #elap_wf2 = elap_wf2 + 1e-9*(event.profile.end - event.profile.start)

                behind = numpy.zeros(1, dtype=dtype_int)
            else:
                event = cl.enqueue_write_buffer(a_queue,
                                                uf1_buf,
Пример #58
0
def load_filt(filt):
    """Upload a float32 filter to the global ``filt_buf`` device buffer.

    The filter must match the (B - T, R - L) shape of the global ``rect``
    view rectangle; a mismatch raises AssertionError.

    Returns the non-blocking write event.
    """
    (L, T), (R, B) = rect
    assert filt.dtype == np.float32
    assert filt.shape[0] == B - T
    assert filt.shape[1] == R - L
    # Bug fix: the original had an unreachable `sys.exit()` after this
    # return; that dead statement has been removed.
    return cl.enqueue_write_buffer(queue, filt_buf, filt, is_blocking=False)

# Allocation
# Host-side field array f (zeros) and coefficient array cf (all 0.5),
# shaped (nx, ny, nz).
f = np.zeros((nx, ny, nz), 'f')
cf = np.ones_like(f) * 0.5

mf = cl.mem_flags
# Per-device buffers: 6 read-write field components (E and H) and
# 3 read-only coefficient buffers, all initialised from the host arrays.
eh_gpus = []
ce_gpus = []
for i, queue in enumerate(queues):
    eh_gpus.append(
        [cl.Buffer(context, mf.READ_WRITE, f.nbytes) for m in range(6)])
    ce_gpus.append(
        [cl.Buffer(context, mf.READ_ONLY, cf.nbytes) for m in range(3)])
    for j in xrange(6):
        cl.enqueue_write_buffer(queue, eh_gpus[i][j], f)
    for j in xrange(3):
        cl.enqueue_write_buffer(queue, ce_gpus[i][j], cf)

# Byte offset of the last yz-plane (x = nx - 1) within a field buffer.
b_offset = (nx - 1) * ny * nz * np.nbytes['float32']
# Host-side staging planes for exchanging boundary slices between devices.
tmpfs = []
for i, queue in enumerate(queues):
    #tmpfs.append( [cl.Buffer(context, mf.READ_WRITE | mf.ALLOC_HOST_PTR, f.nbytes/nx) for m in range(2)] )
    tmpfs.append([np.zeros((ny, nz), dtype=np.float32) for m in range(2)])

# Program and Kernel
Ls = 256
Gs = get_optimal_global_work_size(device)
print('Ls = %d, Gs = %d' % (Ls, Gs))

kern = open('./fdtd3d.cl').read()