def init_buffers(self):
    """Allocate this device's buffers and exchange CUDA IPC handles with peers.

    Allocates zero-filled output and scratch arrays with the shape and dtype
    of the first argument of ``self.op``, records their device pointers under
    ``self.device_id`` and then swaps IPC handles (output buffer, scratch
    buffer, event) with every other device in ``self.op.device_ids``.

    The exchange walks ``self.op.device_ids`` once, in order, and every
    participant issues exactly one ``bcast`` per entry with that entry as the
    root.  Previously each participant first broadcast with *itself* as root,
    so participants disagreed about the root of the same collective call.

    NOTE(review): assumes ``self.comm.bcast(obj, root=...)`` is an MPI-style
    collective, that device ids double as ranks (the original ``root=i``
    already relied on this), and that ``self.op.device_ids`` contains
    ``self.device_id`` and has the same order on every participant --
    confirm against the caller.
    """
    tensor_desc = self.op.args[0].tensor_description()
    shape = tensor_desc.shape
    dtype = tensor_desc.dtype

    # Allocate output and scratch buffers
    self.output_buff = gpuarray.zeros(shape, dtype)
    self.scratch_buff = gpuarray.zeros(shape, dtype)
    self.output_buff_dict[self.device_id] = self.output_buff.gpudata
    self.scratch_buff_dict[self.device_id] = self.scratch_buff.gpudata

    # Export IPC handles for the local buffers and event
    msg = (self.device_id,
           drv.mem_get_ipc_handle(self.output_buff.gpudata),
           drv.mem_get_ipc_handle(self.scratch_buff.gpudata),
           self.event.ipc_handle())

    # One collective per device, same root everywhere: we are the sender when
    # the root is our own id and a receiver otherwise.
    for root in self.op.device_ids:
        if root == self.device_id:
            self.comm.bcast(msg, root=root)
            continue

        (peer_id, output_ipc_hdl, scratch_ipc_hdl, event_ipc_hdl) = \
            self.comm.bcast(None, root=root)

        # Open all three peer handles before publishing any of them
        output_hdl = drv.IPCMemoryHandle(output_ipc_hdl)
        scratch_hdl = drv.IPCMemoryHandle(scratch_ipc_hdl)
        event_hdl = drv.Event.from_ipc_handle(event_ipc_hdl)
        self.output_buff_dict[peer_id] = output_hdl
        self.scratch_buff_dict[peer_id] = scratch_hdl
        self.event_buff_dict[peer_id] = event_hdl
def init_buffers(self):
    """Allocate local buffers and trade CUDA IPC handles with the peers.

    The output buffer has the shape/dtype of the op's first argument; the
    scratch buffer is a flat array of ``segment_size * n_devs`` elements,
    where ``segment_size`` comes from ``calculate_segment_size``.  Handles
    are exchanged with one ``bcast`` per entry of ``self.device_ids``: this
    device is the sender for its own id and a receiver for all others.
    """
    first_arg = self.op.args[0]
    shape = first_arg.tensor_description().shape
    dtype = first_arg.tensor_description().dtype
    n_devs = len(self.op.device_ids)
    size = first_arg.tensor_description().axes.size
    segment_size = calculate_segment_size(size, n_devs)

    # Local output / scratch storage and their raw device pointers
    self.output_buff = gpuarray.zeros(shape, dtype)
    self.scratch_buff = gpuarray.zeros(segment_size * n_devs, dtype)
    self.output_buff_dict[self.device_id] = self.output_buff.gpudata
    self.scratch_buff_dict[self.device_id] = self.scratch_buff.gpudata

    # Everything a peer needs to reach this device's buffers and event
    local_out_hdl = drv.mem_get_ipc_handle(self.output_buff.gpudata)
    local_scratch_hdl = drv.mem_get_ipc_handle(self.scratch_buff.gpudata)
    local_event_hdl = self.event.ipc_handle()
    msg = (self.device_id, local_out_hdl, local_scratch_hdl, local_event_hdl)

    def register_peer(payload):
        # Open all three handles first, then publish them under the peer id.
        peer_id, out_ipc, scratch_ipc, event_ipc = payload
        opened = (drv.IPCMemoryHandle(out_ipc),
                  drv.IPCMemoryHandle(scratch_ipc),
                  drv.Event.from_ipc_handle(event_ipc))
        self.output_buff_dict[peer_id] = opened[0]
        self.scratch_buff_dict[peer_id] = opened[1]
        self.event_buff_dict[peer_id] = opened[2]

    for root in self.device_ids:
        if root != self.device_id:
            register_peer(self.comm.bcast(None, root=root))
        else:
            self.comm.bcast(msg, root=root)
def init_buffers(self):
    """Allocate local buffers and swap CUDA IPC handles through shared queues.

    Output and scratch arrays are zero-filled and shaped like the op's first
    argument.  This device's handles are put on every *other* device's queue
    in ``self.op.shared_queues``; it then reads one entry per peer from its
    own queue and opens the handles it receives.
    """
    first_arg = self.op.args[0]
    shape = first_arg.tensor_description().shape
    dtype = first_arg.tensor_description().dtype

    # Local output / scratch storage and their raw device pointers
    self.output_buff = gpuarray.zeros(shape, dtype)
    self.scratch_buff = gpuarray.zeros(shape, dtype)
    self.output_buff_dict[self.device_id] = self.output_buff.gpudata
    self.scratch_buff_dict[self.device_id] = self.scratch_buff.gpudata

    # Export handles for the local buffers and event
    local_out_hdl = drv.mem_get_ipc_handle(self.output_buff.gpudata)
    local_scratch_hdl = drv.mem_get_ipc_handle(self.scratch_buff.gpudata)
    local_event_hdl = self.event.ipc_handle()

    # Publish to every queue except our own
    for dev_id in self.op.shared_queues.keys():
        if dev_id == self.device_id:
            continue
        self.op.shared_queues[dev_id].put(
            (self.device_id, local_out_hdl, local_scratch_hdl,
             local_event_hdl))

    # Collect one message from each peer
    inbox = self.op.shared_queues[self.device_id]
    n_peers = len(self.op.shared_queues) - 1
    for _ in range(n_peers):
        peer_id, out_ipc, scratch_ipc, event_ipc = inbox.get()
        peer_out = drv.IPCMemoryHandle(out_ipc)
        peer_scratch = drv.IPCMemoryHandle(scratch_ipc)
        peer_event = drv.Event.from_ipc_handle(event_ipc)
        self.output_buff_dict[peer_id] = peer_out
        self.scratch_buff_dict[peer_id] = peer_scratch
        self.event_buff_dict[peer_id] = peer_event
def set_ipc_handle(op, shared_queue, handle):
    """Publish IPC handles for ``handle`` and a fresh lock byte on a queue.

    Allocates one byte of device memory, clears it, and puts the pair
    ``(buffer IPC handle, lock IPC handle)`` on ``shared_queue``.

    ``op`` is accepted but not used here.

    Returns:
        The one-byte device allocation used as the lock.
    """
    # Single zeroed byte on the device acting as the lock
    lock = drv.mem_alloc(1)
    drv.memset_d8(lock, 0, 1)

    shared_queue.put((drv.mem_get_ipc_handle(handle),
                      drv.mem_get_ipc_handle(lock)))
    return lock
def set_ipc_handle(op, shared_queue, handle, local=False):
    """Publish a buffer and a fresh lock byte on ``shared_queue``.

    Allocates one byte of device memory, clears it, and puts
    ``(local, buffer, lock)`` on the queue.  When ``local`` is truthy the
    buffer and lock are sent as plain integers (``int(...)`` of each);
    otherwise they are sent as CUDA IPC memory handles.

    ``op`` is accepted but not used here.

    Returns:
        The one-byte device allocation used as the lock.
    """
    # Single zeroed byte on the device acting as the lock
    lock = drv.mem_alloc(1)
    drv.memset_d8(lock, 0, 1)

    # Same conversion is applied to the buffer first, then to the lock
    export = int if local else drv.mem_get_ipc_handle
    shared_queue.put((local, export(handle), export(lock)))
    return lock
def bcast_ipc_handle(comm, handle=None):
    """Broadcast a CUDA IPC handle over ``comm``.

    With ``handle`` given, its IPC handle is extracted and broadcast, and the
    result of ``comm.bcast`` is returned as-is.  With ``handle`` left as
    ``None`` the caller takes part in the broadcast as a receiver and gets
    back the received handle opened as a ``drv.IPCMemoryHandle``.
    """
    if handle is None:
        # Receiving side: open whatever the broadcast delivered
        received = comm.bcast(handle)
        return drv.IPCMemoryHandle(received)

    # Sending side: export the local allocation and broadcast it
    exported = drv.mem_get_ipc_handle(handle)
    return comm.bcast(exported)
def para_load_init(self):
    """Give the loading process what it needs to write into ``shared_x``.

    In order:
      0. non-blocking send of ``self.config`` to destination 0 (tag 99);
      1. send shape, dtype and CUDA IPC handle of the GPU array behind
         ``self.model.shared_x`` over a zmq PAIR socket connected to the
         localhost port stored in ``self.config['sock_data']``;
      2. blocking send of ``self.data[4]`` (the image mean) to destination 0
         (tag 66).
    """
    # 0. the config dict goes first (it can't carry any special objects)
    self.icomm.isend(self.config, dest=0, tag=99)

    cuda_drv = self.drv
    batch_var = self.model.shared_x
    mean_image = self.data[4]
    port = self.config['sock_data']

    import zmq
    endpoint = 'tcp://localhost:{0}'.format(port)
    pair_sock = zmq.Context().socket(zmq.PAIR)
    pair_sock.connect(endpoint)

    import theano.misc.pycuda_init
    import theano.misc.pycuda_utils

    # View the shared variable's storage as a pycuda array and export it
    device_batch = theano.misc.pycuda_utils.to_gpuarray(
        batch_var.container.value)
    ipc_handle = cuda_drv.mem_get_ipc_handle(device_batch.ptr)

    # 1. IPC handle of shared_x together with its shape and dtype
    pair_sock.send_pyobj(
        (device_batch.shape, device_batch.dtype, ipc_handle))

    # 2. image mean
    self.icomm.send(mean_image, dest=0, tag=66)
def para_load_init(self):
    """Give the loading process what it needs to write into ``shared_x``.

    In order:
      0. non-blocking send of ``self.config`` to destination 0 (tag 99);
      1. send shape, dtype and CUDA IPC handle of the GPU array behind
         ``self.model.shared_x`` over a zmq PAIR socket connected to the
         localhost port stored in ``self.config['sock_data']``;
      2. blocking send of ``self.data[4]`` (the image mean) to destination 0
         (tag 66).
    """
    # 0. the config dict goes first (it can't carry any special objects)
    self.icomm.isend(self.config, dest=0, tag=99)

    cuda_drv = self.drv
    batch_var = self.model.shared_x
    mean_image = self.data[4]
    port = self.config['sock_data']

    import zmq
    endpoint = 'tcp://localhost:{0}'.format(port)
    pair_sock = zmq.Context().socket(zmq.PAIR)
    pair_sock.connect(endpoint)

    import theano.misc.pycuda_init
    import theano.misc.pycuda_utils

    # View the shared variable's storage as a pycuda array and export it
    device_batch = theano.misc.pycuda_utils.to_gpuarray(
        batch_var.container.value)
    ipc_handle = cuda_drv.mem_get_ipc_handle(device_batch.ptr)

    # 1. IPC handle of shared_x together with its shape and dtype
    pair_sock.send_pyobj(
        (device_batch.shape, device_batch.dtype, ipc_handle))

    # 2. image mean
    self.icomm.send(mean_image, dest=0, tag=66)
def getGdfHandles(self, df):
    """Build one ``gdfHandles`` record per column of ``df``.

    For every ``(name, series)`` pair in ``df._cols`` the column's
    ``cffi_view`` is read and a ``gdfHandles`` record is created from the
    CUDA IPC handles of its ``data`` and ``valid`` pointers plus its
    ``size``, ``null_count``, ``dtype``, ``dtype_info`` and the column name
    (the last field is passed as ``None``).

    Returns:
        list: the records, in the iteration order of ``df._cols``; empty
        when the frame has no columns.  (The previous version returned the
        ``gdfHandles`` factory itself instead of the collected list.)
    """
    handles = []
    for name, series in df._cols.items():
        view = series._column.cffi_view
        # WSM TODO add if statement for valid != nullptr
        handles.append(
            gdfHandles(
                drv.mem_get_ipc_handle(view.data),
                drv.mem_get_ipc_handle(view.valid),
                view.size,
                view.null_count,
                view.dtype,
                view.dtype_info,
                name,
                None))
    return handles
def proc1():
    """Share a small GPU array with another process via a CUDA IPC handle.

    Connects a zmq REQ socket to ``tcp://localhost:5000``, creates a CUDA
    context on device 0, uploads eight random values, sends the array's
    shape, dtype and IPC memory handle, and waits for the peer's reply
    before releasing the context.

    The context is detached in a ``finally`` clause so it is released even
    when the upload or the socket exchange raises.
    """
    sock = zmq.Context().socket(zmq.REQ)
    sock.connect('tcp://localhost:5000')

    drv.init()
    dev = drv.Device(0)
    ctx = dev.make_context()
    try:
        x_gpu = gpuarray.to_gpu(np.random.rand(8))
        h = drv.mem_get_ipc_handle(x_gpu.ptr)
        sock.send_pyobj((x_gpu.shape, x_gpu.dtype, h))
        # Block until the peer answers; the array must stay alive until then
        sock.recv_pyobj()
    finally:
        ctx.detach()
def func1():
    """Send the CUDA IPC handle of a sample device buffer over zmq.

    Creates a CUDA context on device 0, connects a zmq REQ socket to
    ``tcp://localhost:6000``, obtains a device buffer from
    ``create_sample_device_data()`` and sends its IPC memory handle.

    The context is popped in a ``finally`` clause so it is released even
    when socket setup, allocation or the send raises.

    NOTE(review): the function returns right after the send without waiting
    for a reply -- confirm the receiver opens the handle while the
    allocation is still alive.
    """
    drv.init()
    dev = drv.Device(0)
    ctx_gpu = dev.make_context()
    try:
        ctx = zmq.Context()
        sock = ctx.socket(zmq.REQ)
        sock.connect('tcp://localhost:6000')

        x_gpu = create_sample_device_data()
        h = drv.mem_get_ipc_handle(x_gpu)
        sock.send_pyobj(h)
    finally:
        ctx_gpu.pop()
def setup_ipc_handle(op, comm, cmd, handle=None, dest=None):
    """Send or receive a device buffer reference over ``comm``.

    ``cmd == 'send'``: for each entry of ``dest`` a ``(local, buffer)`` pair
    is sent with tag ``TAG_IPC``.  When the destination equals
    ``op.metadata['device_id']`` the buffer travels as ``int(handle)`` and
    ``local`` is ``True``; otherwise it travels as a CUDA IPC memory handle
    and ``local`` is ``False``.  Nothing is returned.

    Any other ``cmd``: one pair is received from ``op.source_id``.  A local
    buffer is returned unchanged; a remote one is opened as a
    ``drv.IPCMemoryHandle`` first.
    """
    if cmd != 'send':
        local, buf_ipc_hdl = comm.recv(source=op.source_id, tag=TAG_IPC)
        if not local:
            return drv.IPCMemoryHandle(buf_ipc_hdl)
        return buf_ipc_hdl

    for d in dest:
        local = bool(op.metadata['device_id'] == int(d))
        if local:
            buf_ipc_hdl = int(handle)
        else:
            buf_ipc_hdl = drv.mem_get_ipc_handle(handle)
        comm.send((local, buf_ipc_hdl), dest=int(d), tag=TAG_IPC)
def __init__(self, array):
    """Create an IPC memory handle for a device array.

    Args:
        array (~pycuda.gpuarray.GPUArray): GPU array to be shared across
            processes.

    The handle is taken from ``array.ipc_handle`` when ``array`` is already
    a ``drv.IPCMemoryHandle`` and extracted from ``array.ptr`` otherwise.
    ``shape``, ``dtype``, ``size`` and ``mem_size`` are then copied from
    ``array``.
    """
    if not isinstance(array, drv.IPCMemoryHandle):
        self.handle = drv.mem_get_ipc_handle(array.ptr)
    else:
        # do not doubly extract IPC memory handle
        self.handle = array.ipc_handle

    # Mirror the array metadata a receiver needs to rebuild the array
    for attr in ('shape', 'dtype', 'size', 'mem_size'):
        setattr(self, attr, getattr(array, attr))
def client():
    """Send the CUDA IPC handle of a sample device buffer to the server.

    Creates a CUDA context on device 0, opens a blazingdb protocol client
    over the unix socket at ``unix_path``, obtains a device buffer from
    ``create_sample_device_data()``, and sends the raw bytes of its IPC
    memory handle, printing progress and the response along the way.

    The context is popped in a ``finally`` clause so it is released even
    when the connection, allocation or send raises.
    """
    drv.init()
    dev = drv.Device(0)
    ctx_gpu = dev.make_context()
    try:
        connection = blazingdb.protocol.UnixSocketConnection(unix_path)
        sock = blazingdb.protocol.Client(connection)

        x_gpu = create_sample_device_data()
        print('gpu type: ')
        print(type(x_gpu))

        h = drv.mem_get_ipc_handle(x_gpu)
        print('send handler')
        print(h)

        # The handle is transmitted as its raw byte representation
        res = sock.send(bytes(h))
        print(res)
    finally:
        ctx_gpu.pop()
def para_load_init(queue_dict, drv, shared_x, img_mean):
    """Give the loading process what it needs to write into ``shared_x``.

    Sends the shape, dtype and CUDA IPC handle of the GPU array behind
    ``shared_x`` over a zmq PAIR socket connected to the localhost port in
    ``queue_dict['sock_data']``, then puts ``img_mean`` on
    ``queue_dict['queue_t2l']``.
    """
    port = queue_dict['sock_data']
    to_loader = queue_dict['queue_t2l']
    from_loader = queue_dict['queue_l2t']  # looked up but not used here

    import zmq
    endpoint = 'tcp://localhost:{0}'.format(port)
    pair_sock = zmq.Context().socket(zmq.PAIR)
    pair_sock.connect(endpoint)

    import theano.misc.pycuda_init
    import theano.misc.pycuda_utils

    # View the shared variable's storage as a pycuda array and export it
    device_batch = theano.misc.pycuda_utils.to_gpuarray(
        shared_x.container.value)
    ipc_handle = drv.mem_get_ipc_handle(device_batch.ptr)

    # 1. IPC handle of shared_x together with its shape and dtype
    pair_sock.send_pyobj(
        (device_batch.shape, device_batch.dtype, ipc_handle))

    # 2. image mean
    to_loader.put(img_mean)
def ipc_handle(self, addr):
    """Return the CUDA IPC memory handle for the device address ``addr``."""
    handle = cuda.mem_get_ipc_handle(addr)
    return handle
def train_net(config, private_config): # UNPACK CONFIGS (train_videos_spatial_jhmdb,val_videos_spatial_jhmdb,train_videos_temporal_jhmdb,val_videos_temporal_jhmdb, train_targets,val_targets, train_labels_jhmdb,val_labels_jhmdb) = unpack_configs_jhmdb(config,gpu_id=private_config['gpu_id']) # print('val_len',len(val_videos_spatial_jhmdb),'train_len',len(train_videos_spatial_jhmdb)) if config['modal']=='rgb': train_videos = list(train_videos_spatial_jhmdb) test_videos = list(val_videos_spatial_jhmdb) else: train_videos = list(train_videos_temporal_jhmdb) test_videos = list(val_videos_temporal_jhmdb) print('jhmdb_len',len(train_videos),len(train_labels_jhmdb))#,len(tr_video_length_jhmdb)) flag_para_load =config['para_load'] gpu_send_queue = private_config['queue_gpu_send'] gpu_recv_queue = private_config['queue_gpu_recv'] # pycuda and zmq set up drv.init() dev = drv.Device(int(private_config['gpu'][-1])) ctx = dev.make_context() sock_gpu = zmq.Context().socket(zmq.PAIR) if private_config['flag_client']: sock_gpu.connect('tcp://*****:*****@ iter = ', num_iter print 'training cost:', cost_ij,'cost_nll:',cost_nll,'cost_attention:',cost_att if config['print_train_error']: error_ij = train_error() gpu_send_queue.put(error_ij) that_error = gpu_recv_queue.get() error_ij = (error_ij + that_error) / 2. 
if private_config['flag_verbose']: print 'training error rate:', error_ij if flag_para_load and (count < len(minibatch_range)): load_send_queue.put('calc_finished') if count%20 == 0: e = time.time() print "time per 20 iter:", (e - s) # ############### Test on Validation Set ################## DropoutLayer.SetDropoutOff() this_val_error, this_val_loss = get_test_error(config, shared_x, shared_mask, shared_y,shared_target,shared_use_noise, shared_conv,test_videos, val_labels_jhmdb, flag_para_load, batch_size,num_seq, validate_model_lstm,train_model, send_queue=load_send_queue, recv_queue=load_recv_queue) # report validation stats gpu_send_queue.put(this_val_error) that_val_error = gpu_recv_queue.get() this_val_error = (this_val_error + that_val_error) / 2. gpu_send_queue.put(this_val_loss) that_val_loss = gpu_recv_queue.get() this_val_loss = (this_val_loss + that_val_loss) / 2. if private_config['flag_verbose']: print('epoch %i: test loss of jhmdb %f ' % (epoch, this_val_loss)) print('epoch %i: test error of jhmdb %f %%' % (epoch, this_val_error * 100.)) val_record.append([this_val_error, this_val_loss]) if private_config['flag_save']: np.save(config['weights_dir'] + 'test_record_jhmdb.npy', val_record) DropoutLayer.SetDropoutOn() ########################################### # Adapt Learning Rate step_idx = adjust_learning_rate(config, epoch, step_idx, val_record, learning_rate) # Save Weights, only one of them will do if private_config['flag_save'] : if epoch % config['snapshot_freq'] == 0: save_weights(layers, config['weights_dir'], epoch) np.save(config['weights_dir'] + 'lr_' + str(epoch) + '.npy', learning_rate.get_value()) save_momentums(vels, config['weights_dir'], epoch) print('Optimization complete.')
def main():
    """Run a sample DDL + DML round trip against the orchestrator.

    Creates a CUDA context on device 0, connects a ``PyConnector``, creates
    table ``nation``, shares two sample device buffers (data and validity)
    through CUDA IPC handles, runs ``select id > 5 from main.nation`` and
    prints the result-set metadata and the contents of every returned
    column.

    ``Error`` raised by an individual client step is printed and execution
    continues, as before.  The connection close and the context pop now sit
    in ``finally`` clauses so they also run when something other than
    ``Error`` escapes (for example a CUDA failure while creating the sample
    data).

    NOTE(review): the source has no indentation; the trailing
    ``print("#RESULT_SET:")`` / ``free_result`` pair is placed after the
    column loop -- confirm.
    """
    client = PyConnector('/tmp/orchestrator.socket', '/tmp/ral.socket')

    cuda.init()
    dev = cuda.Device(0)
    ctx_gpu = dev.make_context()
    try:
        try:
            try:
                client.connect()
            except Error as err:
                print(err)

            try:
                client.run_ddl_create_table('nation', ['id'], ['GDF_INT8'],
                                            'main')
            except Error as err:
                print(err)

            # IPC handles travel as raw bytes inside the table description
            data_gpu, data_sz = create_sample_device_data()
            data_handler = bytes(cuda.mem_get_ipc_handle(data_gpu))

            valid_gpu, data_sz = create_sample_device_data()
            valid_handler = bytes(cuda.mem_get_ipc_handle(valid_gpu))

            try:
                tableGroup = {
                    'tables': [{
                        'name': 'main.nation',
                        'columns': [{
                            'data': data_handler,
                            'valid': valid_handler,
                            'size': data_sz,
                            'dtype': 1,
                            'null_count': 0,
                            'dtype_info': 0
                        }],
                        'columnNames': ['id']
                    }],
                    'name': 'main',
                }
                resultSet = client.run_dml_query(
                    'select id > 5 from main.nation', tableGroup)

                print("#RESULT_SET:")
                print('GetResult Response')
                print(' metadata:')
                print(' status: %s' % resultSet.metadata.status)
                print(' message: %s' % resultSet.metadata.message)
                print(' time: %s' % resultSet.metadata.time)
                print(' rows: %s' % resultSet.metadata.rows)
                print(' columnNames: %s' % list(resultSet.columnNames))

                for i, column in enumerate(resultSet.columns):
                    # x_ptr: device raw pointer
                    x_ptr = cuda.IPCMemoryHandle(column.data)
                    x_gpu = gpuarray.GPUArray((1, column.size), numpy.int8,
                                              gpudata=x_ptr)
                    print('\tgpu: ', x_gpu.get())

                print("#RESULT_SET:")
                resultSet = client.free_result(123456)
            except Error as err:
                print(err)
        finally:
            client.close_connection()
    finally:
        ctx_gpu.pop()
def fun_mlp(shared_args, private_args, this_queue, that_queue):
    '''
    Train one MLP replica on one GPU and average weights with a peer replica.

    shared_args contains neural network parameters
        (keys read here: 'learning_rate', 'n_epochs', 'dataset',
        'batch_size', 'L1_reg', 'L2_reg', 'n_hidden')
    private_args contains parameters for process run on each gpu
        (keys read here: 'ind_gpu', 'flag_client', 'gpu', 'mod', 'verbose')
    this_queue and that_queue are used for synchronization between
    processes: every sync point is a put('') on this_queue followed by a
    get() on that_queue.

    Python 2 code (print statements, xrange).

    NOTE(review): the source of this function has no indentation; the
    nesting of the training loop and of the final prints below was
    reconstructed -- confirm against the original file.
    '''
    learning_rate = shared_args['learning_rate']
    n_epochs = shared_args['n_epochs']
    dataset = shared_args['dataset']
    batch_size = shared_args['batch_size']
    L1_reg = shared_args['L1_reg']
    L2_reg = shared_args['L2_reg']
    n_hidden = shared_args['n_hidden']

    ####
    # pycuda and zmq environment
    drv.init()
    dev = drv.Device(private_args['ind_gpu'])
    ctx = dev.make_context()
    # PAIR socket: one process connects, the other binds, on port 5000
    sock = zmq.Context().socket(zmq.PAIR)
    if private_args['flag_client']:
        sock.connect('tcp://localhost:5000')
    else:
        sock.bind('tcp://*:5000')
    ####

    ####
    # import theano related
    # (imported here, after the pycuda context has been created)
    import theano.sandbox.cuda
    theano.sandbox.cuda.use(private_args['gpu'])
    import theano
    import theano.tensor as T
    from logistic_sgd import load_data
    from mlp import MLP
    import theano.misc.pycuda_init
    import theano.misc.pycuda_utils
    ####

    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    rng = np.random.RandomState(1234)

    classifier = MLP(rng=rng, input=x, n_in=28 * 28, n_hidden=n_hidden, n_out=10)

    # negative log likelihood plus L1 and squared-L2 regularization terms
    cost = (classifier.negative_log_likelihood(y) + L1_reg * classifier.L1 + L2_reg * classifier.L2_sqr)

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                y: valid_set_y[index * batch_size:(index + 1) * batch_size]}
    )

    gparams = [T.grad(cost, param) for param in classifier.params]

    # plain gradient-descent step for every parameter
    updates = [(param, param - learning_rate * gparam) for param, gparam in zip(classifier.params, gparams)]

    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]})

    ####
    # setting pycuda and
    # pass handles, only done once
    param_ga_list = []
    # a list of pycuda gpuarrays which point to value of theano shared variable on this gpu
    param_other_list = []
    # a list of theano shared variables that are used to store values of theano shared variable from the other gpu
    param_ga_other_list = []
    # a list of pycuda gpuarrays which point to theano shared variables in param_other_list
    h_list = []
    # a list of pycuda IPC handles
    shape_list = []
    # a list containing shapes of variables in param_ga_list
    dtype_list = []
    # a list containing dtypes of variables in param_ga_list
    average_fun_list = []
    # a list containing theano functions for averaging parameters

    for param in classifier.params:
        param_other = theano.shared(param.get_value())
        param_ga = \
            theano.misc.pycuda_utils.to_gpuarray(param.container.value)
        param_ga_other = \
            theano.misc.pycuda_utils.to_gpuarray(
                param_other.container.value)
        h = drv.mem_get_ipc_handle(param_ga.ptr)
        # replaces param by the mean of itself and the peer's copy
        average_fun = \
            theano.function([], updates=[(param, (param + param_other) / 2.)])

        param_other_list.append(param_other)
        param_ga_list.append(param_ga)
        param_ga_other_list.append(param_ga_other)
        h_list.append(h)
        shape_list.append(param_ga.shape)
        dtype_list.append(param_ga.dtype)
        average_fun_list.append(average_fun)

    # pass shape, dtype and handles
    sock.send_pyobj((shape_list, dtype_list, h_list))
    shape_other_list, dtype_other_list, h_other_list = sock.recv_pyobj()

    param_ga_remote_list = []
    # create gpuarray point to the other gpu use the passed information
    for shape_other, dtype_other, h_other in zip(shape_other_list, dtype_other_list, h_other_list):
        param_ga_remote = \
            gpuarray.GPUArray(shape_other, dtype_other, gpudata=drv.IPCMemoryHandle(h_other))
        param_ga_remote_list.append(param_ga_remote)
    ####

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # sync point before the timer starts
    this_queue.put('')
    that_queue.get()
    start_time = time.time()

    epoch = 0
    while epoch < n_epochs:
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            # each process trains on the minibatches matching its 'mod'
            if minibatch_index % 2 == private_args['mod']:
                train_model(minibatch_index)

                this_queue.put('')
                that_queue.get()

                # exchanging weights
                # (copy the peer's parameter values into the local
                # param_other buffers)
                for param_ga, param_ga_other, param_ga_remote in \
                        zip(param_ga_list, param_ga_other_list, param_ga_remote_list):
                    drv.memcpy_peer(param_ga_other.ptr, param_ga_remote.ptr, param_ga_remote.dtype.itemsize * param_ga_remote.size, ctx, ctx)

                ctx.synchronize()

                this_queue.put('')
                that_queue.get()

                for average_fun in average_fun_list:
                    average_fun()

        if private_args['verbose']:
            validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
            this_validation_loss = np.mean(validation_losses)

            print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.))

    end_time = time.time()

    this_queue.put('')
    that_queue.get()

    if private_args['verbose']:
        print 'The code run for %d epochs, with %f epochs/sec' % (
            epoch, 1. * epoch / (end_time - start_time))
        print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.1fs' % ((end_time - start_time)))
def fun_mlp(shared_args, private_args, this_queue, that_queue):
    '''
    Train one MLP replica on one GPU and average weights with a peer replica.

    shared_args contains neural network parameters
        (keys read here: 'learning_rate', 'n_epochs', 'dataset',
        'batch_size', 'L1_reg', 'L2_reg', 'n_hidden')
    private_args contains parameters for process run on each gpu
        (keys read here: 'ind_gpu', 'flag_client', 'gpu', 'mod', 'verbose')
    this_queue and that_queue are used for synchronization between
    processes: every sync point is a put('') on this_queue followed by a
    get() on that_queue.

    Python 2 code (print statements, xrange).

    NOTE(review): the source of this function has no indentation; the
    nesting of the training loop and of the final prints below was
    reconstructed -- confirm against the original file.
    '''
    learning_rate = shared_args['learning_rate']
    n_epochs = shared_args['n_epochs']
    dataset = shared_args['dataset']
    batch_size = shared_args['batch_size']
    L1_reg = shared_args['L1_reg']
    L2_reg = shared_args['L2_reg']
    n_hidden = shared_args['n_hidden']

    ####
    # pycuda and zmq environment
    drv.init()
    dev = drv.Device(private_args['ind_gpu'])
    ctx = dev.make_context()
    # PAIR socket: one process connects, the other binds, on port 5000
    sock = zmq.Context().socket(zmq.PAIR)
    if private_args['flag_client']:
        sock.connect('tcp://localhost:5000')
    else:
        sock.bind('tcp://*:5000')
    ####

    ####
    # import theano related
    # (imported here, after the pycuda context has been created)
    import theano.sandbox.cuda
    theano.sandbox.cuda.use(private_args['gpu'])
    import theano
    import theano.tensor as T
    from logistic_sgd import load_data
    from mlp import MLP
    import theano.misc.pycuda_init
    import theano.misc.pycuda_utils
    ####

    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    rng = np.random.RandomState(1234)

    classifier = MLP(rng=rng, input=x, n_in=28 * 28, n_hidden=n_hidden, n_out=10)

    # negative log likelihood plus L1 and squared-L2 regularization terms
    cost = (classifier.negative_log_likelihood(y) + L1_reg * classifier.L1 + L2_reg * classifier.L2_sqr)

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    gparams = [T.grad(cost, param) for param in classifier.params]

    # plain gradient-descent step for every parameter
    updates = [(param, param - learning_rate * gparam) for param, gparam in zip(classifier.params, gparams)]

    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    ####
    # setting pycuda and
    # pass handles, only done once
    param_ga_list = []
    # a list of pycuda gpuarrays which point to value of theano shared variable on this gpu
    param_other_list = []
    # a list of theano shared variables that are used to store values of theano shared variable from the other gpu
    param_ga_other_list = []
    # a list of pycuda gpuarrays which point to theano shared variables in param_other_list
    h_list = []
    # a list of pycuda IPC handles
    shape_list = []
    # a list containing shapes of variables in param_ga_list
    dtype_list = []
    # a list containing dtypes of variables in param_ga_list
    average_fun_list = []
    # a list containing theano functions for averaging parameters

    for param in classifier.params:
        param_other = theano.shared(param.get_value())
        param_ga = \
            theano.misc.pycuda_utils.to_gpuarray(param.container.value)
        param_ga_other = \
            theano.misc.pycuda_utils.to_gpuarray(
                param_other.container.value)
        h = drv.mem_get_ipc_handle(param_ga.ptr)
        # replaces param by the mean of itself and the peer's copy
        average_fun = \
            theano.function([], updates=[(param, (param + param_other) / 2.)])

        param_other_list.append(param_other)
        param_ga_list.append(param_ga)
        param_ga_other_list.append(param_ga_other)
        h_list.append(h)
        shape_list.append(param_ga.shape)
        dtype_list.append(param_ga.dtype)
        average_fun_list.append(average_fun)

    # pass shape, dtype and handles
    sock.send_pyobj((shape_list, dtype_list, h_list))
    shape_other_list, dtype_other_list, h_other_list = sock.recv_pyobj()

    param_ga_remote_list = []
    # create gpuarray point to the other gpu use the passed information
    for shape_other, dtype_other, h_other in zip(shape_other_list, dtype_other_list, h_other_list):
        param_ga_remote = \
            gpuarray.GPUArray(shape_other, dtype_other, gpudata=drv.IPCMemoryHandle(h_other))
        param_ga_remote_list.append(param_ga_remote)
    ####

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # sync point before the timer starts
    this_queue.put('')
    that_queue.get()
    start_time = time.time()

    epoch = 0
    while epoch < n_epochs:
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            # each process trains on the minibatches matching its 'mod'
            if minibatch_index % 2 == private_args['mod']:
                train_model(minibatch_index)

                this_queue.put('')
                that_queue.get()

                # exchanging weights
                # (copy the peer's parameter values into the local
                # param_other buffers)
                for param_ga, param_ga_other, param_ga_remote in \
                        zip(param_ga_list, param_ga_other_list, param_ga_remote_list):
                    drv.memcpy_peer(
                        param_ga_other.ptr, param_ga_remote.ptr,
                        param_ga_remote.dtype.itemsize * param_ga_remote.size,
                        ctx, ctx)

                ctx.synchronize()

                this_queue.put('')
                that_queue.get()

                for average_fun in average_fun_list:
                    average_fun()

        if private_args['verbose']:
            validation_losses = [
                validate_model(i) for i in xrange(n_valid_batches)
            ]
            this_validation_loss = np.mean(validation_losses)

            print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.))

    end_time = time.time()

    this_queue.put('')
    that_queue.get()

    if private_args['verbose']:
        print 'The code run for %d epochs, with %f epochs/sec' % (
            epoch, 1. * epoch / (end_time - start_time))
        print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.1fs' % ((end_time - start_time)))
def validate_performance(config):
    """Load saved weights into AlexNet and report validation statistics.

    Reads from ``config``: 'gpu', 'load_epoch', 'weights_dir' and, when
    parallel loading is enabled, 'sock_data', 'queue_t2l' and 'queue_l2t'.

    Prints the validation error, top-5 validation error and validation loss.

    Returns:
        tuple: ``(this_validation_error, this_validation_loss)``; the top-5
        error is printed but not returned.

    Python 2 code (print statement).
    """
    # UNPACK CONFIGS
    (flag_para_load, train_filenames, val_filenames, train_labels, val_labels, img_mean) = unpack_configs(config)

    if flag_para_load:
        # pycuda and zmq set up
        drv.init()
        # device index is taken from the last character of config['gpu']
        dev = drv.Device(int(config['gpu'][-1]))
        ctx = dev.make_context()
        sock = zmq.Context().socket(zmq.PAIR)
        sock.connect('tcp://localhost:{0}'.format(config['sock_data']))

        load_send_queue = config['queue_t2l']
        load_recv_queue = config['queue_l2t']
    else:
        load_send_queue = None
        load_recv_queue = None

    # theano is imported here, after the optional pycuda context set up
    import theano.sandbox.cuda
    theano.sandbox.cuda.use(config['gpu'])
    import theano
    theano.config.on_unused_input = 'warn'

    from layers import DropoutLayer
    from alex_net import AlexNet, compile_models

    import theano.misc.pycuda_init
    import theano.misc.pycuda_utils

    # # BUILD NETWORK ##
    model = AlexNet(config)
    layers = model.layers
    batch_size = model.batch_size

    # # COMPILE FUNCTIONS ##
    (train_model, validate_model, train_error, learning_rate, shared_x, shared_y, rand_arr, vels) = compile_models(model, config, flag_top_5=True)

    print '... training'

    if flag_para_load:
        # pass ipc handle and related information
        gpuarray_batch = theano.misc.pycuda_utils.to_gpuarray(
            shared_x.container.value)
        h = drv.mem_get_ipc_handle(gpuarray_batch.ptr)
        sock.send_pyobj((gpuarray_batch.shape, gpuarray_batch.dtype, h))

        load_send_queue.put(img_mean)

    load_epoch = config['load_epoch']
    load_weights(layers, config['weights_dir'], load_epoch)

    # dropout is switched off for evaluation
    DropoutLayer.SetDropoutOff()

    this_validation_error, this_validation_error_top_5, this_validation_loss = \
        get_val_error_loss(rand_arr, shared_x, shared_y, val_filenames, val_labels, flag_para_load,img_mean, batch_size, validate_model, send_queue=load_send_queue, recv_queue=load_recv_queue, flag_top_5=True)

    print('validation error %f %%' % (this_validation_error * 100.))
    print('top 5 validation error %f %%' % (this_validation_error_top_5 * 100.))
    print('validation loss %f ' % (this_validation_loss))

    return this_validation_error, this_validation_loss
def train_net(config): # UNPACK CONFIGS (flag_para_load, train_filenames, val_filenames, train_labels, val_labels, img_mean) = unpack_configs(config) # pycuda set up drv.init() dev = drv.Device(int(config['gpu'][-1])) ctx = dev.make_context() if flag_para_load: # zmq set up sock = zmq.Context().socket(zmq.PAIR) sock.connect('tcp://*****:*****@ iter = ', num_iter print 'training cost:', cost_ij if config['print_train_error']: print 'training error rate:', train_error() if flag_para_load and (count < len(minibatch_range)): load_send_queue.put('calc_finished') ############### Test on Validation Set ################## DropoutLayer.SetDropoutOff() this_validation_error, this_validation_loss = get_val_error_loss( rand_arr, shared_x, shared_y, val_filenames, val_labels, flag_para_load, img_mean, batch_size, validate_model, send_queue=load_send_queue, recv_queue=load_recv_queue) print('epoch %i: validation loss %f ' % (epoch, this_validation_loss)) print('epoch %i: validation error %f %%' % (epoch, this_validation_error * 100.)) val_record.append([this_validation_error, this_validation_loss]) np.save(config['weights_dir'] + 'val_record.npy', val_record) DropoutLayer.SetDropoutOn() ############################################ # Adapt Learning Rate step_idx = adjust_learning_rate(config, epoch, step_idx, val_record, learning_rate) # Save weights if epoch % config['snapshot_freq'] == 0: save_weights(layers, config['weights_dir'], epoch) np.save(config['weights_dir'] + 'lr_' + str(epoch) + '.npy', learning_rate.get_value()) save_momentums(vels, config['weights_dir'], epoch) print('Optimization complete.')
def train_net(config, private_config): # UNPACK CONFIGS (flag_para_load, train_filenames, val_filenames, train_labels, val_labels, img_mean) = \ unpack_configs(config, ext_data=private_config['ext_data'], ext_label=private_config['ext_label']) gpu_send_queue = private_config['queue_gpu_send'] gpu_recv_queue = private_config['queue_gpu_recv'] # pycuda and zmq set up drv.init() dev = drv.Device(int(private_config['gpu'][-1])) ctx = dev.make_context() sock_gpu = zmq.Context().socket(zmq.PAIR) if private_config['flag_client']: sock_gpu.connect('tcp://*****:*****@ iter = ', num_iter log_iter.write("%d\n" % num_iter) log_iter.flush() print 'training cost:', cost_ij log_err_cost.write("%f\n" % cost_ij) log_err_cost.flush() if config['print_train_error']: error_ij = train_error() gpu_send_queue.put(error_ij) that_error = gpu_recv_queue.get() error_ij = (error_ij + that_error) / 2. if private_config['flag_verbose']: print 'training error rate:', error_ij log_err_rate.write("%f\n" % error_ij) log_err_rate.flush() if flag_para_load and (count < len(minibatch_range)): load_send_queue.put('calc_finished') if count%20 == 0: e = time.time() print "time per 20 iter:", (e - s) ############### Test on Validation Set ################## DropoutLayer.SetDropoutOff() this_val_error, this_val_loss = get_val_error_loss( rand_arr, shared_x, shared_y, val_filenames, val_labels, flag_para_load, img_mean, batch_size, validate_model, send_queue=load_send_queue, recv_queue=load_recv_queue) # report validation stats gpu_send_queue.put(this_val_error) that_val_error = gpu_recv_queue.get() this_val_error = (this_val_error + that_val_error) / 2. gpu_send_queue.put(this_val_loss) that_val_loss = gpu_recv_queue.get() this_val_loss = (this_val_loss + that_val_loss) / 2. 
if private_config['flag_verbose']: print('epoch %i: validation loss %f ' % (epoch, this_val_loss)) print('epoch %i: validation error %f %%' % (epoch, this_val_error * 100.)) val_record.append([this_val_error, this_val_loss]) if private_config['flag_save']: np.save(config['weights_dir'] + 'val_record.npy', val_record) np.savetxt(config['weights_dir'] + 'val_record_txt.txt', val_record) DropoutLayer.SetDropoutOn() ############################################ # Adapt Learning Rate step_idx = adjust_learning_rate(config, epoch, step_idx, val_record, learning_rate) # Save Weights, only one of them will do if private_config['flag_save']: if epoch % config['snapshot_freq'] == 0: save_weights(layers, config['weights_dir'], epoch) np.save(config['weights_dir'] + 'lr_' + str(epoch) + '.npy', learning_rate.get_value()) save_momentums(vels, config['weights_dir'], epoch) print('Optimization complete.')
def bind_buffers(self):
    """Bind the kernel's buffers and send the tensor's IPC handle.

    A ``TensorDescription`` stored in ``self.tensor`` is first replaced by
    the view returned from ``tensor_view_from_td``.  After the parent
    class has bound its buffers, the CUDA IPC handle of the tensor's device
    memory is sent to ``self.destination`` with tag ``TAG_IPC``.
    """
    current = self.tensor
    if isinstance(current, TensorDescription):
        self.tensor = self.tensor_view_from_td(current)

    super(CudaSendKernel, self).bind_buffers()

    device_mem = self.tensor.tensor.gpudata
    ipc_handle = drv.mem_get_ipc_handle(device_mem)
    self.comm.send(ipc_handle, dest=self.destination, tag=TAG_IPC)
def train_net(config, private_config): # UNPACK CONFIGS (flag_para_load, flag_datalayer, train_filenames, val_filenames, train_labels, val_labels, img_mean) = \ unpack_configs(config, ext_data=private_config['ext_data'], ext_label=private_config['ext_label']) gpu_send_queue = private_config['queue_gpu_send'] gpu_recv_queue = private_config['queue_gpu_recv'] # pycuda and zmq set up drv.init() dev = drv.Device(int(private_config['gpu'][-1])) ctx = dev.make_context() sock_gpu = zmq.Context().socket(zmq.PAIR) if private_config['flag_client']: sock_gpu.connect('tcp://*****:*****@ iter = ', num_iter print 'training cost:', cost_ij if config['print_train_error']: error_ij = train_error() gpu_send_queue.put(error_ij) that_error = gpu_recv_queue.get() error_ij = (error_ij + that_error) / 2. if private_config['flag_verbose']: print 'training error rate:', error_ij if flag_para_load and (count < len(minibatch_range)): load_send_queue.put('calc_finished') ############### Test on Validation Set ################## DropoutLayer.SetDropoutOff() this_val_error, this_val_loss = get_val_error_loss( rand_arr, shared_x, shared_y, val_filenames, val_labels, flag_datalayer, flag_para_load, batch_size, validate_model, send_queue=load_send_queue, recv_queue=load_recv_queue) # report validation stats gpu_send_queue.put(this_val_error) that_val_error = gpu_recv_queue.get() this_val_error = (this_val_error + that_val_error) / 2. gpu_send_queue.put(this_val_loss) that_val_loss = gpu_recv_queue.get() this_val_loss = (this_val_loss + that_val_loss) / 2. 
if private_config['flag_verbose']: print('epoch %i: validation loss %f ' % (epoch, this_val_loss)) print('epoch %i: validation error %f %%' % (epoch, this_val_error * 100.)) val_record.append([this_val_error, this_val_loss]) if private_config['flag_save']: np.save(config['weights_dir'] + 'val_record.npy', val_record) DropoutLayer.SetDropoutOn() ############################################ # Adapt Learning Rate step_idx = adjust_learning_rate(config, epoch, step_idx, val_record, learning_rate) # Save Weights, only one of them will do if private_config['flag_save']: if epoch % config['snapshot_freq'] == 0: save_weights(layers, config['weights_dir'], epoch) np.save(config['weights_dir'] + 'lr_' + str(epoch) + '.npy', learning_rate.get_value()) save_momentums(vels, config['weights_dir'], epoch) print('Optimization complete.')