def fun_load(config, sock_data=5000):

    send_queue = config['queue_l2t']
    recv_queue = config['queue_t2l']
    # recv_queue and send_queue are multiprocessing.Queue
    # recv_queue is only for receiving
    # send_queue is only for sending

    # whether to do random crop and mirror in this loading process
    flag_randproc = not config['use_data_layer']
    flag_batch = config['batch_crop_mirror']

    drv.init()
    dev = drv.Device(int(config['gpu'][-1]))
    ctx = dev.make_context()
    sock = zmq.Context().socket(zmq.PAIR)
    sock.bind('tcp://*:{0}'.format(sock_data))

    shape, dtype, h = sock.recv_pyobj()
    print 'shared_x information received', shape, dtype
    shape = (3, 255, 255, 256)  # TODO: remove this hard-coded shape override

    gpu_data_remote = gpuarray.GPUArray(shape, dtype,
                                        gpudata=drv.IPCMemoryHandle(h))
    gpu_data = gpuarray.GPUArray(shape, dtype)

    img_mean = recv_queue.get()
    print 'img_mean received'

    # The first time, do the set ups and other stuff

    # receive information for loading
    while True:
        # getting the hkl file name to load
        hkl_name = recv_queue.get()
        # print hkl_name

        # data = pickle.load(open(hkl_name)) - img_mean
        data = hkl.load(hkl_name) - img_mean
        # print 'load ', time.time() - bgn_time

        if flag_randproc:
            param_rand = recv_queue.get()
            data = crop_and_mirror(data, param_rand, flag_batch=flag_batch)

        gpu_data.set(data)

        # wait for computation on last minibatch to finish
        msg = recv_queue.get()
        assert msg == 'calc_finished'

        drv.memcpy_peer(gpu_data_remote.ptr,
                        gpu_data.ptr,
                        gpu_data.dtype.itemsize * gpu_data.size,
                        ctx, ctx)

        ctx.synchronize()

        send_queue.put('copy_finished')
def fun_load(config, sock_data=5000):

    send_queue = config['queue_l2t']
    recv_queue = config['queue_t2l']
    # recv_queue and send_queue are multiprocessing.Queue
    # recv_queue is only for receiving
    # send_queue is only for sending

    # if need to do random crop and mirror
    flag_randproc = not config['use_data_layer']
    flag_batch = config['batch_crop_mirror']

    drv.init()
    dev = drv.Device(int(config['gpu'][-1]))
    ctx = dev.make_context()
    sock = zmq.Context().socket(zmq.PAIR)
    sock.bind('tcp://*:{0}'.format(sock_data))

    shape, dtype, h = sock.recv_pyobj()
    print 'shared_x information received'

    gpu_data_remote = gpuarray.GPUArray(shape, dtype,
                                        gpudata=drv.IPCMemoryHandle(h))
    gpu_data = gpuarray.GPUArray(shape, dtype)

    img_mean = recv_queue.get()
    print 'img_mean received'

    # The first time, do the set ups and other stuff

    # receive information for loading
    while True:
        # getting the hkl file name to load
        hkl_name = recv_queue.get()
        # print hkl_name

        data = hkl.load(hkl_name) - img_mean
        # print 'load ', time.time() - bgn_time

        if flag_randproc:
            param_rand = recv_queue.get()
            data = crop_and_mirror(data, param_rand, flag_batch=flag_batch)

        gpu_data.set(data)

        # wait for computation on last minibatch to finish
        msg = recv_queue.get()
        assert msg == 'calc_finished'

        drv.memcpy_peer(gpu_data_remote.ptr,
                        gpu_data.ptr,
                        gpu_data.dtype.itemsize * gpu_data.size,
                        ctx, ctx)

        ctx.synchronize()

        send_queue.put('copy_finished')
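# A minimal sketch of the training-process side of the handshake that
# fun_load above expects: export an IPC handle for the Theano shared
# variable backing the input batch, describe the buffer over the same ZMQ
# PAIR socket, then drive the queue protocol. The function name, the
# `shared_x` argument, and the file name are illustrative assumptions,
# not part of the original code; it also assumes theano.misc.pycuda_init
# was imported so a CUDA context already exists.
def send_shared_x_info(shared_x, img_mean, load_send_queue, load_recv_queue,
                       sock_data=5000):
    import zmq
    import pycuda.driver as drv
    import theano.misc.pycuda_utils

    sock = zmq.Context().socket(zmq.PAIR)
    sock.connect('tcp://localhost:{0}'.format(sock_data))

    # export an IPC handle so the loading process can memcpy_peer
    # directly into this buffer
    gpuarray_batch = theano.misc.pycuda_utils.to_gpuarray(
        shared_x.container.value)
    h = drv.mem_get_ipc_handle(gpuarray_batch.ptr)
    sock.send_pyobj((gpuarray_batch.shape, gpuarray_batch.dtype, h))

    # drive fun_load's queue protocol for one hypothetical minibatch;
    # if fun_load does random crop/mirror, it additionally expects
    # load_send_queue.put(param_rand) after the file name
    load_send_queue.put(img_mean)
    load_send_queue.put('train_batch_0.hkl')  # hypothetical file name
    load_send_queue.put('calc_finished')
    assert load_recv_queue.get() == 'copy_finished'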
def fun_load(config, sock_data_2=5001):

    send_queue = config['queue_l2t']
    recv_queue = config['queue_t2l']
    # recv_queue and send_queue are multiprocessing.Queue
    # recv_queue is only for receiving
    # send_queue is only for sending

    num_timesteps = config['num_timesteps']
    num_seq = config['num_seq']
    img_scale_x = config['img_scale_x']
    img_scale_y = config['img_scale_y']

    drv.init()
    dev = drv.Device(int(config['gpu'][-1]))
    ctx_2 = dev.make_context()
    sock_2 = zmq.Context().socket(zmq.PAIR)
    sock_2.bind('tcp://*:{0}'.format(sock_data_2))

    shape_temporal, dtype_temporal, h_temporal = sock_2.recv_pyobj()
    print 'shared_x information received', shape_temporal

    gpu_data_remote_temporal = gpuarray.GPUArray(
        shape_temporal, dtype_temporal,
        gpudata=drv.IPCMemoryHandle(h_temporal))
    gpu_data_temporal = gpuarray.GPUArray(shape_temporal, dtype_temporal)

    # print 'img_mean received'

    # The first time, do the set ups and other stuff

    # receive information for loading
    while True:
        video_name_temporal = recv_queue.get()
        rand_param = recv_queue.get()

        if config['modal'] == 'rgb':
            data_temporal = prepare_data_rgb(
                video_name_temporal, num_timesteps, num_seq, rand_param,
                data_shape=(img_scale_x, img_scale_y, 3))
        else:
            data_temporal = prepare_data_flow(
                video_name_temporal, num_timesteps, num_seq, rand_param,
                data_shape=(img_scale_x, img_scale_y))

        gpu_data_temporal.set(data_temporal)

        # wait for computation on last minibatch to finish
        msg = recv_queue.get()
        assert msg == 'calc_finished'

        drv.memcpy_peer(
            gpu_data_remote_temporal.ptr,
            gpu_data_temporal.ptr,
            gpu_data_temporal.dtype.itemsize * gpu_data_temporal.size,
            ctx_2, ctx_2)

        ctx_2.synchronize()

        send_queue.put('copy_finished')
def fun_load(config, sock_data=5000):

    send_queue = config["queue_l2t"]
    recv_queue = config["queue_t2l"]
    # recv_queue and send_queue are multiprocessing.Queue
    # recv_queue is only for receiving
    # send_queue is only for sending

    # if need to do random crop and mirror
    flag_batch = config["batch_crop_mirror"]

    drv.init()
    dev = drv.Device(int(config["gpu"][-1]))
    ctx = dev.make_context()
    sock = zmq.Context().socket(zmq.PAIR)
    sock.bind("tcp://*:{0}".format(sock_data))

    shape, dtype, h = sock.recv_pyobj()
    print "shared_x information received"

    gpu_data_remote = gpuarray.GPUArray(shape, dtype,
                                        gpudata=drv.IPCMemoryHandle(h))
    gpu_data = gpuarray.GPUArray(shape, dtype)

    img_mean = recv_queue.get()
    print "img_mean received"

    # The first time, do the set ups and other stuff

    # receive information for loading
    while True:
        # getting the hkl file name to load
        hkl_name = recv_queue.get()
        # print hkl_name

        data = hkl.load(hkl_name) - img_mean
        # print 'load ', time.time() - bgn_time

        param_rand = recv_queue.get()
        data = crop_and_mirror(data, param_rand, flag_batch=flag_batch)

        gpu_data.set(data)

        # wait for computation on last minibatch to finish
        msg = recv_queue.get()
        assert msg == "calc_finished"

        drv.memcpy_peer(gpu_data_remote.ptr,
                        gpu_data.ptr,
                        gpu_data.dtype.itemsize * gpu_data.size,
                        ctx, ctx)

        ctx.synchronize()

        send_queue.put("copy_finished")
def copy(array, out=None, out_device=None):
    """Copies a GPUArray object using the default stream.

    This function can copy the device array to the destination array on
    another device.

    Args:
        array (~pycuda.gpuarray.GPUArray): Array to be copied.
        out (~pycuda.gpuarray.GPUArray): Destination array.
            If it is not ``None``, then the ``out_device`` argument is
            ignored.
        out_device: Destination device specifier. The actual device object
            is obtained by passing this value to :func:`get_device`.

    Returns:
        ~pycuda.gpuarray.GPUArray: Copied array.

        If ``out`` is not specified, then the array is allocated on the
        device specified by the ``out_device`` argument.

    """
    in_device = get_device(array)
    if out is None:
        if out_device is None:
            out_device = in_device
        else:
            out_device = get_device(out_device)

        with using_device(out_device):
            out = empty_like(array)
    else:
        out_device = get_device(out)

    with using_device(in_device):
        if in_device == out_device:
            drv.memcpy_dtod(out.ptr, array.ptr, out.nbytes)
        else:
            drv.memcpy_peer(out.ptr, array.ptr, out.nbytes,
                            out_device, in_device)
    return out
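# A short usage sketch for copy() above, assuming get_device() accepts an
# integer device index (as its docstring describes) and that at least two
# CUDA devices are visible; the demo function is illustrative, not part of
# the original module.
def demo_copy():
    import numpy as np

    src = gpuarray.to_gpu(np.arange(16, dtype=np.float32))
    clone = copy(src)                 # same-device clone via memcpy_dtod
    remote = copy(src, out_device=1)  # allocate on device 1, memcpy_peer
    return clone, remote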
def train_net(config, private_config):

    # UNPACK CONFIGS
    (train_videos_spatial_jhmdb, val_videos_spatial_jhmdb,
     train_videos_temporal_jhmdb, val_videos_temporal_jhmdb,
     train_targets, val_targets,
     train_labels_jhmdb, val_labels_jhmdb) = \
        unpack_configs_jhmdb(config, gpu_id=private_config['gpu_id'])
    # print('val_len', len(val_videos_spatial_jhmdb),
    #       'train_len', len(train_videos_spatial_jhmdb))

    if config['modal'] == 'rgb':
        train_videos = list(train_videos_spatial_jhmdb)
        test_videos = list(val_videos_spatial_jhmdb)
    else:
        train_videos = list(train_videos_temporal_jhmdb)
        test_videos = list(val_videos_temporal_jhmdb)
    print('jhmdb_len', len(train_videos),
          len(train_labels_jhmdb))  # , len(tr_video_length_jhmdb))

    flag_para_load = config['para_load']

    gpu_send_queue = private_config['queue_gpu_send']
    gpu_recv_queue = private_config['queue_gpu_recv']

    # pycuda and zmq set up
    drv.init()
    dev = drv.Device(int(private_config['gpu'][-1]))
    ctx = dev.make_context()

    sock_gpu = zmq.Context().socket(zmq.PAIR)
    if private_config['flag_client']:
        sock_gpu.connect('tcp://*****:*****')
    # [the connect address was redacted in the source, and the code from
    #  here up to the middle of the training loop was lost with it; the
    #  function resumes below inside the per-minibatch loop]

            print '@ iter = ', num_iter
            print 'training cost:', cost_ij, \
                'cost_nll:', cost_nll, 'cost_attention:', cost_att

            if config['print_train_error']:
                error_ij = train_error()

                gpu_send_queue.put(error_ij)
                that_error = gpu_recv_queue.get()
                error_ij = (error_ij + that_error) / 2.

                if private_config['flag_verbose']:
                    print 'training error rate:', error_ij

            if flag_para_load and (count < len(minibatch_range)):
                load_send_queue.put('calc_finished')

            if count % 20 == 0:
                e = time.time()
                print "time per 20 iter:", (e - s)

        ############### Test on Validation Set ##################

        DropoutLayer.SetDropoutOff()

        this_val_error, this_val_loss = get_test_error(
            config, shared_x, shared_mask, shared_y, shared_target,
            shared_use_noise, shared_conv,
            test_videos, val_labels_jhmdb,
            flag_para_load, batch_size, num_seq,
            validate_model_lstm, train_model,
            send_queue=load_send_queue, recv_queue=load_recv_queue)

        # report validation stats
        gpu_send_queue.put(this_val_error)
        that_val_error = gpu_recv_queue.get()
        this_val_error = (this_val_error + that_val_error) / 2.

        gpu_send_queue.put(this_val_loss)
        that_val_loss = gpu_recv_queue.get()
        this_val_loss = (this_val_loss + that_val_loss) / 2.

        if private_config['flag_verbose']:
            print('epoch %i: test loss of jhmdb %f ' %
                  (epoch, this_val_loss))
            print('epoch %i: test error of jhmdb %f %%' %
                  (epoch, this_val_error * 100.))

        val_record.append([this_val_error, this_val_loss])
        if private_config['flag_save']:
            np.save(config['weights_dir'] + 'test_record_jhmdb.npy',
                    val_record)

        DropoutLayer.SetDropoutOn()

        ###########################################

        # Adapt Learning Rate
        step_idx = adjust_learning_rate(config, epoch, step_idx,
                                        val_record, learning_rate)

        # Save Weights, only one of them will do
        if private_config['flag_save']:
            if epoch % config['snapshot_freq'] == 0:
                save_weights(layers, config['weights_dir'], epoch)
                np.save(config['weights_dir'] + 'lr_' + str(epoch) +
                        '.npy', learning_rate.get_value())
                save_momentums(vels, config['weights_dir'], epoch)

    print('Optimization complete.')
def fun_mlp(shared_args, private_args, this_queue, that_queue):
    '''
    shared_args contains neural network parameters

    private_args contains parameters for the process run on each gpu

    this_queue and that_queue are used for synchronization between
    processes.
    '''

    learning_rate = shared_args['learning_rate']
    n_epochs = shared_args['n_epochs']
    dataset = shared_args['dataset']
    batch_size = shared_args['batch_size']
    L1_reg = shared_args['L1_reg']
    L2_reg = shared_args['L2_reg']
    n_hidden = shared_args['n_hidden']

    ####
    # pycuda and zmq environment
    drv.init()
    dev = drv.Device(private_args['ind_gpu'])
    ctx = dev.make_context()
    sock = zmq.Context().socket(zmq.PAIR)

    if private_args['flag_client']:
        sock.connect('tcp://localhost:5000')
    else:
        sock.bind('tcp://*:5000')
    ####

    ####
    # import theano related
    import theano.sandbox.cuda
    theano.sandbox.cuda.use(private_args['gpu'])

    import theano
    import theano.tensor as T

    from logistic_sgd import load_data
    from mlp import MLP

    import theano.misc.pycuda_init
    import theano.misc.pycuda_utils
    ####

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.ivector('y')   # the labels are presented as a 1D vector of
                         # [int] labels

    rng = np.random.RandomState(1234)

    classifier = MLP(rng=rng, input=x, n_in=28 * 28,
                     n_hidden=n_hidden, n_out=10)

    cost = (classifier.negative_log_likelihood(y) +
            L1_reg * classifier.L1 + L2_reg * classifier.L2_sqr)

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                y: valid_set_y[index * batch_size:(index + 1) * batch_size]})

    gparams = [T.grad(cost, param) for param in classifier.params]

    updates = [(param, param - learning_rate * gparam)
               for param, gparam in zip(classifier.params, gparams)]

    train_model = theano.function(
        inputs=[index], outputs=cost, updates=updates,
        givens={x: train_set_x[index * batch_size:(index + 1) * batch_size],
                y: train_set_y[index * batch_size:(index + 1) * batch_size]})

    ####
    # setting up pycuda and
    # passing handles, only done once

    # a list of pycuda gpuarrays which point to the values of theano
    # shared variables on this gpu
    param_ga_list = []

    # a list of theano shared variables that are used to store values of
    # theano shared variables from the other gpu
    param_other_list = []

    # a list of pycuda gpuarrays which point to the theano shared
    # variables in param_other_list
    param_ga_other_list = []

    # a list of pycuda IPC handles
    h_list = []

    # a list containing shapes of the variables in param_ga_list
    shape_list = []

    # a list containing dtypes of the variables in param_ga_list
    dtype_list = []

    # a list containing theano functions for averaging parameters
    average_fun_list = []

    for param in classifier.params:
        param_other = theano.shared(param.get_value())
        param_ga = \
            theano.misc.pycuda_utils.to_gpuarray(param.container.value)
        param_ga_other = \
            theano.misc.pycuda_utils.to_gpuarray(
                param_other.container.value)
        h = drv.mem_get_ipc_handle(param_ga.ptr)
        average_fun = \
            theano.function([],
                            updates=[(param, (param + param_other) / 2.)])

        param_other_list.append(param_other)
        param_ga_list.append(param_ga)
        param_ga_other_list.append(param_ga_other)
        h_list.append(h)
        shape_list.append(param_ga.shape)
        dtype_list.append(param_ga.dtype)
        average_fun_list.append(average_fun)

    # pass shape, dtype and handles
    sock.send_pyobj((shape_list, dtype_list, h_list))
    shape_other_list, dtype_other_list, h_other_list = sock.recv_pyobj()

    param_ga_remote_list = []

    # create gpuarrays that point to the other gpu, using the passed
    # information
    for shape_other, dtype_other, h_other in zip(shape_other_list,
                                                 dtype_other_list,
                                                 h_other_list):
        param_ga_remote = \
            gpuarray.GPUArray(shape_other, dtype_other,
                              gpudata=drv.IPCMemoryHandle(h_other))

        param_ga_remote_list.append(param_ga_remote)
    ####

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    this_queue.put('')
    that_queue.get()
    start_time = time.time()

    epoch = 0
    while epoch < n_epochs:
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            if minibatch_index % 2 == private_args['mod']:
                train_model(minibatch_index)

            this_queue.put('')
            that_queue.get()

            # exchanging weights
            for param_ga, param_ga_other, param_ga_remote in \
                    zip(param_ga_list, param_ga_other_list,
                        param_ga_remote_list):

                drv.memcpy_peer(param_ga_other.ptr,
                                param_ga_remote.ptr,
                                param_ga_remote.dtype.itemsize *
                                param_ga_remote.size,
                                ctx, ctx)

            ctx.synchronize()
            this_queue.put('')
            that_queue.get()

            for average_fun in average_fun_list:
                average_fun()

        if private_args['verbose']:
            validation_losses = [validate_model(i)
                                 for i in xrange(n_valid_batches)]
            this_validation_loss = np.mean(validation_losses)

            print('epoch %i, minibatch %i/%i, validation error %f %%' %
                  (epoch, minibatch_index + 1, n_train_batches,
                   this_validation_loss * 100.))

    end_time = time.time()

    this_queue.put('')
    that_queue.get()

    if private_args['verbose']:
        print 'The code ran for %d epochs, with %f epochs/sec' % (
            epoch, 1. * epoch / (end_time - start_time))
        print >> sys.stderr, ('The code for file ' +
                              os.path.split(__file__)[1] +
                              ' ran for %.1fs' % ((end_time - start_time)))
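# A minimal launcher sketch for fun_mlp above. The two queues are
# cross-wired so that each process's this_queue is the other's that_queue,
# which is what the put/get handshakes inside fun_mlp assume; one process
# binds the ZMQ socket (flag_client=False) and the other connects. All
# argument values here are illustrative assumptions.
def launch_two_gpus():
    from multiprocessing import Process, Queue

    shared_args = {'learning_rate': 0.13, 'n_epochs': 10,
                   'dataset': 'mnist.pkl.gz', 'batch_size': 500,
                   'L1_reg': 0.0, 'L2_reg': 0.0001, 'n_hidden': 500}

    args_a = {'ind_gpu': 0, 'gpu': 'gpu0', 'mod': 0,
              'flag_client': False, 'verbose': True}
    args_b = {'ind_gpu': 1, 'gpu': 'gpu1', 'mod': 1,
              'flag_client': True, 'verbose': False}

    queue_a, queue_b = Queue(), Queue()
    p_a = Process(target=fun_mlp,
                  args=(shared_args, args_a, queue_a, queue_b))
    p_b = Process(target=fun_mlp,
                  args=(shared_args, args_b, queue_b, queue_a))
    p_a.start()
    p_b.start()
    p_a.join()
    p_b.join()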
def fun_load(config, sock_data=5000):

    send_queue = config['queue_l2t']
    recv_queue = config['queue_t2l']
    # recv_queue and send_queue are multiprocessing.Queue
    # recv_queue is only for receiving
    # send_queue is only for sending

    # if need to do random crop and mirror
    flag_batch = config['batch_crop_mirror']
    cropsize = config['cropsize']

    drv.init()
    dev = drv.Device(int(config['gpu'][-1]))
    ctx = dev.make_context()
    sock = zmq.Context().socket(zmq.PAIR)
    sock.bind('tcp://*:{0}'.format(sock_data))

    shape, dtype, h = sock.recv_pyobj()
    print 'shared_x information received'

    gpu_data_remote = gpuarray.GPUArray(shape, dtype,
                                        gpudata=drv.IPCMemoryHandle(h))
    gpu_data = gpuarray.GPUArray(shape, dtype)

    img_mean = recv_queue.get()
    print 'img_mean received'

    # The first time, do the set ups and other stuff

    # receive information for loading
    while True:
        # print '... ... in the loop of generating data'
        # getting the hkl file name to load
        hkl_name = recv_queue.get()
        # print hkl_name

        data = hkl.load(hkl_name) - img_mean
        # print 'load ', time.time() - bgn_time

        param_rand = recv_queue.get()
        # print '... ... get the original hkl size:', data.size
        data = crop_and_mirror(data, param_rand, flag_batch=flag_batch,
                               cropsize=cropsize)
        # print '... ... get the cropped size:', data.size
        # raw_input("Press Enter to continue...")

        gpu_data.set(data)

        # wait for computation on last minibatch to finish
        msg = recv_queue.get()
        assert msg == 'calc_finished'

        drv.memcpy_peer(gpu_data_remote.ptr,
                        gpu_data.ptr,
                        gpu_data.dtype.itemsize * gpu_data.size,
                        ctx, ctx)

        ctx.synchronize()

        send_queue.put('copy_finished')
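# crop_and_mirror() is called above but not defined in this file. Below is
# a hypothetical minimal version for data laid out as
# (channel, height, width, batch). It assumes param_rand holds two
# relative crop offsets and a mirror flag generated on the training side,
# and that flag_batch=True applies one crop/mirror to the whole batch
# (the per-image variant is omitted from this sketch).
def crop_and_mirror(data, param_rand, flag_batch=True, cropsize=227):
    import numpy as np

    row_off = int(round(param_rand[0] * (data.shape[1] - cropsize)))
    col_off = int(round(param_rand[1] * (data.shape[2] - cropsize)))
    flag_mirror = bool(round(param_rand[2]))

    data = data[:, row_off:row_off + cropsize,
                col_off:col_off + cropsize, :]
    if flag_mirror:
        data = data[:, :, ::-1, :]  # horizontal flip along the width axis

    # contiguous float32 so gpu_data.set(data) can upload it directly
    return np.ascontiguousarray(data, dtype='float32')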
def train_net(config, private_config):

    # UNPACK CONFIGS
    (flag_para_load, flag_datalayer, train_filenames, val_filenames,
     train_labels, val_labels, img_mean) = \
        unpack_configs(config, ext_data=private_config['ext_data'],
                       ext_label=private_config['ext_label'])

    gpu_send_queue = private_config['queue_gpu_send']
    gpu_recv_queue = private_config['queue_gpu_recv']

    # pycuda and zmq set up
    drv.init()
    dev = drv.Device(int(private_config['gpu'][-1]))
    ctx = dev.make_context()

    sock_gpu = zmq.Context().socket(zmq.PAIR)
    if private_config['flag_client']:
        sock_gpu.connect('tcp://*****:*****')
    # [the connect address was redacted in the source, and the code from
    #  here up to the middle of the training loop was lost with it; the
    #  function resumes below inside the per-minibatch loop]

            print '@ iter = ', num_iter
            print 'training cost:', cost_ij

            if config['print_train_error']:
                error_ij = train_error()

                gpu_send_queue.put(error_ij)
                that_error = gpu_recv_queue.get()
                error_ij = (error_ij + that_error) / 2.

                if private_config['flag_verbose']:
                    print 'training error rate:', error_ij

            if flag_para_load and (count < len(minibatch_range)):
                load_send_queue.put('calc_finished')

        ############### Test on Validation Set ##################

        DropoutLayer.SetDropoutOff()

        this_val_error, this_val_loss = get_val_error_loss(
            rand_arr, shared_x, shared_y,
            val_filenames, val_labels,
            flag_datalayer, flag_para_load, batch_size, validate_model,
            send_queue=load_send_queue, recv_queue=load_recv_queue)

        # report validation stats
        gpu_send_queue.put(this_val_error)
        that_val_error = gpu_recv_queue.get()
        this_val_error = (this_val_error + that_val_error) / 2.

        gpu_send_queue.put(this_val_loss)
        that_val_loss = gpu_recv_queue.get()
        this_val_loss = (this_val_loss + that_val_loss) / 2.

        if private_config['flag_verbose']:
            print('epoch %i: validation loss %f ' %
                  (epoch, this_val_loss))
            print('epoch %i: validation error %f %%' %
                  (epoch, this_val_error * 100.))

        val_record.append([this_val_error, this_val_loss])
        if private_config['flag_save']:
            np.save(config['weights_dir'] + 'val_record.npy', val_record)

        DropoutLayer.SetDropoutOn()

        ############################################

        # Adapt Learning Rate
        step_idx = adjust_learning_rate(config, epoch, step_idx,
                                        val_record, learning_rate)

        # Save Weights, only one of them will do
        if private_config['flag_save']:
            if epoch % config['snapshot_freq'] == 0:
                save_weights(layers, config['weights_dir'], epoch)
                np.save(config['weights_dir'] + 'lr_' + str(epoch) +
                        '.npy', learning_rate.get_value())
                save_momentums(vels, config['weights_dir'], epoch)

    print('Optimization complete.')
def train_net(config, private_config):

    # UNPACK CONFIGS
    (flag_para_load, train_filenames, val_filenames,
     train_labels, val_labels, img_mean) = \
        unpack_configs(config, ext_data=private_config['ext_data'],
                       ext_label=private_config['ext_label'])

    gpu_send_queue = private_config['queue_gpu_send']
    gpu_recv_queue = private_config['queue_gpu_recv']

    # pycuda and zmq set up
    drv.init()
    dev = drv.Device(int(private_config['gpu'][-1]))
    ctx = dev.make_context()

    sock_gpu = zmq.Context().socket(zmq.PAIR)
    if private_config['flag_client']:
        sock_gpu.connect('tcp://*****:*****')
    # [the connect address was redacted in the source, and the code from
    #  here up to the middle of the training loop was lost with it; the
    #  function resumes below inside the per-minibatch loop]

            print '@ iter = ', num_iter
            log_iter.write("%d\n" % num_iter)
            log_iter.flush()
            print 'training cost:', cost_ij
            log_err_cost.write("%f\n" % cost_ij)
            log_err_cost.flush()

            if config['print_train_error']:
                error_ij = train_error()

                gpu_send_queue.put(error_ij)
                that_error = gpu_recv_queue.get()
                error_ij = (error_ij + that_error) / 2.

                if private_config['flag_verbose']:
                    print 'training error rate:', error_ij
                log_err_rate.write("%f\n" % error_ij)
                log_err_rate.flush()

            if flag_para_load and (count < len(minibatch_range)):
                load_send_queue.put('calc_finished')

            if count % 20 == 0:
                e = time.time()
                print "time per 20 iter:", (e - s)

        ############### Test on Validation Set ##################

        DropoutLayer.SetDropoutOff()

        this_val_error, this_val_loss = get_val_error_loss(
            rand_arr, shared_x, shared_y,
            val_filenames, val_labels,
            flag_para_load, img_mean,
            batch_size, validate_model,
            send_queue=load_send_queue, recv_queue=load_recv_queue)

        # report validation stats
        gpu_send_queue.put(this_val_error)
        that_val_error = gpu_recv_queue.get()
        this_val_error = (this_val_error + that_val_error) / 2.

        gpu_send_queue.put(this_val_loss)
        that_val_loss = gpu_recv_queue.get()
        this_val_loss = (this_val_loss + that_val_loss) / 2.

        if private_config['flag_verbose']:
            print('epoch %i: validation loss %f ' %
                  (epoch, this_val_loss))
            print('epoch %i: validation error %f %%' %
                  (epoch, this_val_error * 100.))

        val_record.append([this_val_error, this_val_loss])
        if private_config['flag_save']:
            np.save(config['weights_dir'] + 'val_record.npy', val_record)
            np.savetxt(config['weights_dir'] + 'val_record_txt.txt',
                       val_record)

        DropoutLayer.SetDropoutOn()

        ############################################

        # Adapt Learning Rate
        step_idx = adjust_learning_rate(config, epoch, step_idx,
                                        val_record, learning_rate)

        # Save Weights, only one of them will do
        if private_config['flag_save']:
            if epoch % config['snapshot_freq'] == 0:
                save_weights(layers, config['weights_dir'], epoch)
                np.save(config['weights_dir'] + 'lr_' + str(epoch) +
                        '.npy', learning_rate.get_value())
                save_momentums(vels, config['weights_dir'], epoch)

    print('Optimization complete.')