Example #1
import time
import zmq
import hickle as hkl
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
# crop_and_mirror is assumed to sit in lib.train_funcs next to get_rand3d,
# which Example #2 imports from that module.
from lib.train_funcs import get_rand3d, crop_and_mirror


def fun_load(queue_dict):

    sock_data = queue_dict['sock_data']
    gpuid = int(queue_dict['device'][-1])
    send_queue = queue_dict['queue_l2t']
    recv_queue = queue_dict['queue_t2l']
    # recv_queue and send_queue are multiprocessing.Queue
    # recv_queue is only for receiving
    # send_queue is only for sending
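    # Message protocol, as exercised in the loop below:
    #   trainer -> loader: mode, filename_list, then per file
    #     'load_file'      -> load and augment the next batch
    #     'calc_finished'  -> previous minibatch done, safe to overwrite shared_x
    #   loader -> trainer:
    #     'copy_finished'  -> batch is in shared_x, training may start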

    # random crop and mirror are applied to each batch in the main loop below
    
    # set the CPU affinity of this loader process according to its GPU id
    from lib.train_funcs import set_cpu_affi
    set_cpu_affi(gpuid)

    drv.init()
    dev = drv.Device(gpuid)
    ctx = dev.make_context()
    # PAIR socket over which the train process sends the shared_x IPC handle
    sock = zmq.Context().socket(zmq.PAIR)
    sock.bind('tcp://*:{0}'.format(sock_data))

    shape, dtype, h = sock.recv_pyobj()
    print '1. shared_x information received'

    gpu_data_remote = gpuarray.GPUArray(shape, dtype,
                                        gpudata=drv.IPCMemoryHandle(h))
    gpu_data = gpuarray.GPUArray(shape, dtype)
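    # gpu_data_remote aliases the train process's shared_x memory through the
    # CUDA IPC handle h; gpu_data is a local staging buffer, so the next batch
    # can be prepared while the trainer still computes on shared_x.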

    img_mean = recv_queue.get()
    print '2. img_mean received'

    count = 0
    while True:
        
        mode = recv_queue.get()
        print '3. mode received: %s' % mode
        
        filename_list = recv_queue.get()
        print '4. filename list received'

        for filename in filename_list:
            
            assert recv_queue.get() == 'load_file'

            # subtract the dataset mean, then randomly crop/mirror the batch
            data = hkl.load(str(filename)) - img_mean

            rand_arr = get_rand3d(True, int(time.time() * 1000) % 100)

            data = crop_and_mirror(data, rand_arr, flag_batch=True)

            gpu_data.set(data)
            
            # 5. wait for computation on last minibatch to finish  
            msg = recv_queue.get()
            assert msg == 'calc_finished'

            drv.memcpy_dtod(gpu_data_remote.ptr,
                            gpu_data.ptr,
                            gpu_data.dtype.itemsize *
                            gpu_data.size,
                            )

            # block until the device-to-device copy completes before signalling
            ctx.synchronize()

            # 6. tell train proc to start train on this batch
            send_queue.put('copy_finished')
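
The loader above is only the consumer side of the CUDA IPC handshake. Below is a minimal sketch, assuming the train process owns shared_x as a pycuda GPUArray, of what it must do before fun_load can proceed; send_shared_x_handle and its arguments are illustrative names, while drv.mem_get_ipc_handle is the real pycuda call (it is also referenced in the comments of Example #2).

import zmq
import pycuda.driver as drv

def send_shared_x_handle(sock_data, shared_x_gpuarray):
    # Connect to the loader's PAIR socket (fun_load binds tcp://*:<sock_data>).
    sock = zmq.Context().socket(zmq.PAIR)
    sock.connect('tcp://localhost:{0}'.format(sock_data))
    # Export an IPC handle for the GPU buffer owned by this process and send
    # (shape, dtype, handle), matching sock.recv_pyobj() in fun_load. The CUDA
    # context must already be initialized when this is called.
    h = drv.mem_get_ipc_handle(shared_x_gpuarray.ptr)
    sock.send_pyobj((shared_x_gpuarray.shape, shared_x_gpuarray.dtype, h))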
Example #2
import numpy as np
# Worker and EASGD are assumed to come from the platoon package
# (platoon.channel.Worker, platoon.param_sync.EASGD); drv_init, init_params,
# load_model, para_load_init, p_iter and unzip are project-local helpers whose
# modules are not shown in this snippet.
from platoon.channel import Worker
from platoon.param_sync import EASGD


def train_convnet(queue_dict,
                  valid_sync=False,
                  verbose=False):
    
    gpuid = int(queue_dict['device'][-1])
    from lib.train_funcs import set_cpu_affi
    set_cpu_affi(gpuid)

    worker = Worker(control_port=5567)
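    # The Worker talks to a central controller over this control port; the
    # main loop below drives training with the controller's replies ('train',
    # 'valid', 'stop') to send_req().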

    # Load Model options
    model_options = locals().copy()
    
    import yaml
    # safe_load avoids constructing arbitrary Python objects from the YAML;
    # fall back to yaml.load only if the configs rely on custom tags.
    with open('config.yaml', 'r') as f:
        training_config = yaml.safe_load(f)
    name = training_config['name']

    with open(name + '.yaml', 'r') as f:
        model_config = yaml.safe_load(f)
    model_options = dict(model_options.items() + training_config.items() +
                         model_config.items() + queue_dict.items())
    
    print "model options", model_options

    print 'Loading data'
    
    from lib.train_funcs import unpack_configs,proc_configs, get_rand3d, adjust_learning_rate
    proc_configs(model_options)
    train_len = model_options['avg_freq']  # train this many minibatches between syncs
    (flag_para_load, flag_top_5,
            train_filenames, val_filenames, train_labels, val_labels, img_mean) = \
            unpack_configs(model_options, ext_data='.hkl', ext_label='.npy')
    
    #train_filenames = train_filenames[:8]
    
    #val_filenames = val_filenames[:4]
    print 'Building model'
    
    # shared_x must be created after driver initialization and before
    # drv.mem_get_ipc_handle() is called; otherwise the memory handle is invalid.
    drv = drv_init(queue_dict)
    # This creates the initial parameters as numpy ndarrays:
    # dict name (string) -> numpy ndarray
    tparams, model, drp = init_params(model_options)

    if model_options['resume_train']:
        load_epoch = model_options['load_epoch']
        # NOTE: layers, learning_rate and vels are only bound by compile_model()
        # further down, so as written this call would raise a NameError;
        # resuming belongs after the model is compiled.
        load_model(load_epoch, layers, learning_rate, vels,
                   path=model_options['load_path'])

    # Using alpha = 1/N
    worker.init_shared_params(tparams,
                              param_sync_rule=EASGD(1.0/model_options['size']))
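    # Each sync applies the elastic-averaging update between this worker and
    # the central copy held by the controller:
    #   worker: w_i      <- w_i      - alpha * (w_i - w_center)
    #   center: w_center <- w_center + alpha * (w_i - w_center)
    # With alpha = 1/N, the N workers contribute equally to the central copy.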

    print "Params init done"
    
    from lib.googlenet import get_shared_x_y, compile_model, compile_val
    shared_x_list, shared_y = get_shared_x_y(model_options)

    train_model, get_vel, descent_vel, params, vels, vels2, learning_rate = \
        compile_model(model, model_options, shared_x_list, shared_y)

    val_model = compile_val(model, model_options, shared_x_list, shared_y)

    print 'Optimization'
                    
    # parallel data loading
    para_load_init(queue_dict, drv, shared_x_list[0], img_mean)

    para_train_it = p_iter(model_options, shared_y, train_filenames,
                           train_labels, train_model, 'train')
    para_val_it = p_iter(model_options, shared_y, val_filenames,
                         val_labels, val_model, 'val')

    best_p = None
    
    def print_time(amount, train_time_list, comm_time_list, wait_time_list):
        train, comm, wait = sum(train_time_list), sum(comm_time_list), sum(wait_time_list)
        print 'time per %d images: %.2f (train %.2f comm %.2f wait %.2f)' % \
            (amount, train + comm + wait, train, comm, wait)
        return train + comm + wait, train, comm, wait

    count = 0
    start_time = None
    
    import time
    inforec_list = []
    train_error_list = []
    val_error_list = []
    all_time_list = []
    epoch_time_list = []
    lr_list = []
    epoch = 0
    step_idx = 0
    
    train_time_list = []
    wait_time_list = []
    comm_time_list = []
    
    while True:
        
        req_time = time.time()
        
        step = worker.send_req('next')
        
        #print step

        req_time = time.time() - req_time
        
        if step == 'train':
            
            if start_time is None:
                start_time = time.time()
 
            for i in xrange(train_len): # sync with server every train_len iter

                train_time, wait_time, cost, error, _ = next(para_train_it)  
                train_time_list.append(train_time)
                wait_time_list.append(wait_time)
                
                count += 1
                if count % (5120 / model_options['file_batch_size']) == 0:
                    print ''
                    print '%d %.4f %.4f' % (count, cost, error)
                    train_error_list.append([count, cost, error])
                    t_all, t_train, t_comm, t_wait = print_time(
                        5120, train_time_list, comm_time_list, wait_time_list)
                    all_time_list.append([count, t_all, t_train, t_comm, t_wait])
                    train_time_list = []
                    wait_time_list = []
                    comm_time_list = []
            
            comm_time = time.time()

            step = worker.send_req(dict(done=train_len))

            if verbose: print "Syncing"
            # exchange parameters with the central copy (EASGD update)
            worker.sync_params(synchronous=True)

            comm_time_list.append(time.time() - comm_time + req_time)


        """
        if step.startswith('save '):
            _, saveto = step.split(' ', 1)
            print 'Saving...',
            # TODO fix that shit so that saving works.
            numpy.savez(saveto, history_errs=history_errs, **s.params)
            pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
            print 'Done'
        """

        if step == 'valid':
            
            if valid_sync:
                worker.copy_to_local()
                
            drp.SetDropoutOff()
            
            cost_list = []
            error_list = []
            error_top_5_list = []
            
            for i in xrange(len(val_filenames)):

                _, _, cost, error, error_top_5 = next(para_val_it)

                cost_list.append(cost)
                error_list.append(error)
                error_top_5_list.append(error_top_5)

                print '.',
            print ''

            validation_loss = np.mean(cost_list)
            validation_error = np.mean(error_list)
            validation_error_top5 = np.mean(error_top_5_list)
            
            print 'validation cost:%.4f' % validation_loss
            print 'validation error:%.4f' % validation_error
            print 'validation top_5_error:%.4f' % validation_error_top5
            val_error_list.append([count, validation_loss, \
                        validation_error, validation_error_top5])

            drp.SetDropoutOn()

            res = worker.send_req(dict(test_err=float(validation_error),
                                       valid_err=float(validation_error)))

            if res == 'best':
                best_p = unzip(tparams)

            if valid_sync:
                worker.copy_to_local()
                
                
            # get total iterations processed by all workers
            uidx = worker.send_req('uidx')
            
            uepoch = int(uidx / len(train_filenames))

            if model.name == 'alexnet':

                if model_options['lr_policy'] == 'step':

                    if uepoch >= 20 and uepoch < 40 and step_idx == 0:
                        learning_rate.set_value(
                            np.float32(learning_rate.get_value() / 10))
                        print 'Learning rate divided by 10'
                        step_idx = 1

                    elif uepoch >= 40 and uepoch < 60 and step_idx == 1:
                        learning_rate.set_value(
                            np.float32(learning_rate.get_value() / 10))
                        print 'Learning rate divided by 10'
                        step_idx = 2

                    elif uepoch >= 60 and uepoch < 70 and step_idx == 2:
                        learning_rate.set_value(
                            np.float32(learning_rate.get_value() / 10))
                        print 'Learning rate divided by 10'
                        step_idx = 3


                if model_options['lr_policy'] == 'auto':
                    if uepoch > 5 and (val_error_list[-3][2] - val_error_list[-1][2] <
                                       model_options['lr_adapt_threshold']):
                        learning_rate.set_value(
                            np.float32(learning_rate.get_value() / 10))

                lr = np.float32(learning_rate.get_value())
                          
            elif model.name == 'googlenet':

                # Poly lr policy according to
                # https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet
                # Note: this scales the *current* rate instead of the base rate,
                # so the decay compounds across validation rounds.
                max_iter = len(train_filenames) * 240
                lr = learning_rate.get_value() * \
                    pow((1. - 1. * uepoch * len(train_filenames) / max_iter), 0.5)
                lr = np.float32(lr)
                learning_rate.set_value(lr)
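                # For reference, Caffe's poly policy computes per iteration
                #   lr = base_lr * (1 - iter / max_iter) ** power
                # with power = 0.5 in bvlc_googlenet's quick_solver.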

            else:
                raise NotImplementedError

            print 'Learning rate now:', lr

            lr_list.append(lr)

            if start_time is not None:
                epoch_time_list.append([count, time.time() - start_time])
                epoch = int(count / len(train_filenames))
                print 'epoch %d time %.2fh, global epoch is %d' % (
                    epoch, epoch_time_list[-1][1] / 3600.0, uepoch)
                
                inforec_list = [train_error_list,
                                val_error_list,
                                all_time_list,
                                epoch_time_list,
                                lr_list
                                ]
                
                import pickle
                filepath = '../run/inforec/inforec_%s.pkl' % queue_dict['device']
                with open(filepath, 'wb') as f:
                    pickle.dump(inforec_list, f, protocol=pickle.HIGHEST_PROTOCOL)
            
            start_time = None

        if step == 'stop':
            break

    # Release all shared resources.
    worker.close()