def fun_load(queue_dict): sock_data = queue_dict['sock_data'] gpuid = int(queue_dict['device'][-1]) send_queue = queue_dict['queue_l2t'] recv_queue = queue_dict['queue_t2l'] # recv_queue and send_queue are multiprocessing.Queue # recv_queue is only for receiving # send_queue is only for sending # if need to do random crop and mirror from lib.train_funcs import set_cpu_affi set_cpu_affi(gpuid) drv.init() dev = drv.Device(gpuid) ctx = dev.make_context() sock = zmq.Context().socket(zmq.PAIR) sock.bind('tcp://*:{0}'.format(sock_data)) shape, dtype, h = sock.recv_pyobj() print '1. shared_x information received' gpu_data_remote = gpuarray.GPUArray(shape, dtype, gpudata=drv.IPCMemoryHandle(h)) gpu_data = gpuarray.GPUArray(shape, dtype) img_mean = recv_queue.get() print '2. img_mean received' count=0 import time while True: mode = recv_queue.get() print '3. mode received: %s' % mode filename_list = recv_queue.get() print '4. filename list received' for filename in filename_list: assert recv_queue.get() == 'load_file' data = hkl.load(str(filename)) - img_mean rand_arr = get_rand3d(True, int(time.time()*1000)%100) data = crop_and_mirror(data, rand_arr, flag_batch=True) gpu_data.set(data) # 5. wait for computation on last minibatch to finish msg = recv_queue.get() assert msg == 'calc_finished' drv.memcpy_dtod(gpu_data_remote.ptr, gpu_data.ptr, gpu_data.dtype.itemsize * gpu_data.size, ) ctx.synchronize() # 6. tell train proc to start train on this batch send_queue.put('copy_finished')
def train_convnet( queue_dict, valid_sync=False, verbose = False ): gpuid = int(queue_dict['device'][-1]) from lib.train_funcs import set_cpu_affi set_cpu_affi(gpuid) worker = Worker(control_port=5567) # Load Model options model_options = locals().copy() import yaml with open('config.yaml', 'r') as f: training_config = yaml.load(f) name=training_config['name'] with open(name+'.yaml', 'r') as f: model_config = yaml.load(f) model_options = dict(model_options.items()+training_config.items()+model_config.items()+queue_dict.items()) print "model options", model_options print 'Loading data' from lib.train_funcs import unpack_configs,proc_configs, get_rand3d, adjust_learning_rate proc_configs(model_options) train_len = model_options['avg_freq'] # Train for this many minibatches when requested (flag_para_load, flag_top_5, train_filenames, val_filenames, train_labels, val_labels, img_mean) = \ unpack_configs(model_options, ext_data='.hkl', ext_label='.npy') #train_filenames = train_filenames[:8] #val_filenames = val_filenames[:4] print 'Building model' # shared_x should be created after driver initialization and before drv.mem_get_ipc_handle() is called, otherwise memhandle will be invalid drv = drv_init(queue_dict) # This create the initial parameters as numpy ndarrays. # Dict name (string) -> numpy ndarray tparams, model, drp = init_params(model_options) if model_options['resume_train']: load_epoch=model_options['load_epoch'] load_model(load_epoch, layers, learning_rate, vels, \ path=model_options['load_path']) worker.init_shared_params(tparams, param_sync_rule=EASGD(1.0/model_options['size'])) # Using alpha = 1/N print "Params init done" from lib.googlenet import get_shared_x_y,compile_model,compile_val shared_x_list, shared_y = get_shared_x_y(model_options) train_model, get_vel, descent_vel, params, vels,vels2, learning_rate = \ compile_model(model, model_options,shared_x_list,shared_y) val_model = compile_val(model, model_options,shared_x_list,shared_y) print 'Optimization' # parallel data loading para_load_init(queue_dict, drv, shared_x_list[0],img_mean) para_train_it = p_iter(model_options, shared_y, train_filenames, \ train_labels, train_model, 'train') para_val_it = p_iter(model_options, shared_y, val_filenames, \ val_labels, val_model, 'val') best_p = None def print_time(amount, train_time_list,comm_time_list,wait_time_list): train,comm,wait = sum(train_time_list), sum(comm_time_list), sum (wait_time_list) print 'time per %d images: %.2f (train %.2f comm %.2f wait %.2f)' % \ (amount, train+comm+wait, train,comm,wait) return train+comm+wait, train,comm,wait count=0 start_time = None import time inforec_list = [] train_error_list = [] val_error_list = [] all_time_list = [] epoch_time_list = [] lr_list = [] epoch=0 step_idx = 0 train_time_list = [] wait_time_list = [] comm_time_list = [] while True: req_time= time.time() step = worker.send_req('next') #print step req_time = time.time() - req_time if step == 'train': if start_time==None: start_time = time.time() for i in xrange(train_len): # sync with server every train_len iter train_time, wait_time, cost, error, _ = next(para_train_it) train_time_list.append(train_time) wait_time_list.append(wait_time) count+=1 if (count) % (5120/model_options['file_batch_size']) ==0: print '' print '%d %.4f %.4f'% (count, cost, error) train_error_list.append([count, cost, error]) t_all,t_train,t_comm,t_wait = print_time(5120, train_time_list, comm_time_list, wait_time_list) all_time_list.append([count,t_all,t_train,t_comm,t_wait]) train_time_list = [] wait_time_list =[] comm_time_list = [] comm_time = time.time() step = worker.send_req(dict(done=train_len)) if verbose: print "Syncing" worker.sync_params(synchronous=True) comm_time_list.append(time.time() - comm_time + req_time) """ if step.startswith('save '): _, saveto = step.split(' ', 1) print 'Saving...', # TODO fix that shit so that saving works. numpy.savez(saveto, history_errs=history_errs, **s.params) pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1) print 'Done' """ if step == 'valid': if valid_sync: worker.copy_to_local() drp.SetDropoutOff() cost_list = [] error_list = [] error_top_5_list = [] for i in xrange(len(val_filenames)): _, _, cost,error,error_top_5= next(para_val_it) cost_list.append(cost) error_list.append(error) error_top_5_list.append(error_top_5) print '.', print '' validation_loss = np.mean(cost_list) validation_error = np.mean(error_list) validation_error_top5 = np.mean(error_top_5_list) print 'validation cost:%.4f' % validation_loss print 'validation error:%.4f' % validation_error print 'validation top_5_error:%.4f' % validation_error_top5 val_error_list.append([count, validation_loss, \ validation_error, validation_error_top5]) drp.SetDropoutOn() res = worker.send_req(dict(test_err=float(validation_error), valid_err=float(validation_error))) if res == 'best': best_p = unzip(tparams) if valid_sync: worker.copy_to_local() # get total iterations processed by all workers uidx = worker.send_req('uidx') uepoch = int(uidx/len(train_filenames)) if model.name=='alexnet': if model_options['lr_policy'] == 'step': if uepoch >=20 and uepoch < 40 and step_idx==0: learning_rate.set_value( np.float32(learning_rate.get_value() / 10)) print 'Learning rate divided by 10' step_idx = 1 elif uepoch >=40 and uepoch < 60 and step_idx==1: learning_rate.set_value( np.float32(learning_rate.get_value() / 10)) print 'Learning rate divided by 10' step_idx = 2 elif uepoch >=60 and uepoch < 70 and step_idx==2: learning_rate.set_value( np.float32(learning_rate.get_value() / 10)) print 'Learning rate divided by 10' step_idx = 3 else: pass if model_options['lr_policy'] == 'auto': if uepoch>5 and (val_error_list[-3][2] - val_error_list[-1][2] < model_options['lr_adapt_threshold']): learning_rate.set_value( np.float32(learning_rate.get_value() / 10)) lr = learning_rate.get_value() lr = np.float32(lr) elif model.name=='googlenet': # Poly lr policy according to # https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet max_iter = len(train_filenames)*240 lr = learning_rate.get_value() * \ pow( (1. - 1.* uepoch*len(train_filenames) / max_iter), 0.5 ) lr = np.float32(lr) learning_rate.set_value(lr) else: raise NotImplementedError print 'Learning rate now:', lr lr_list.append(lr) if start_time!=None: epoch_time_list.append([count , time.time()-start_time]) epoch = int(count/len(train_filenames) ) print 'epoch %d time %.2fh, global epoch is %d' % (epoch, epoch_time_list[-1][1]/3600.0, uepoch) inforec_list = [train_error_list, val_error_list, all_time_list, epoch_time_list, lr_list ] import pickle filepath = '../run/inforec/inforec_%s.pkl' % queue_dict['device'] with open(filepath, 'wb') as f: pickle.dump(inforec_list, f, protocol=pickle.HIGHEST_PROTOCOL) start_time=None if step == 'stop': break # Release all shared ressources. worker.close()