Example #1
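The classes below reference several names that are imported at module level in their original files and are not part of these excerpts (np, MPI, PTWorker, Client, test_intercomm). A plausible preamble is sketched here; the module paths for PTWorker, Client and test_intercomm are assumptions about the surrounding project, not something the listing confirms.

# Assumed module-level imports (not shown in the excerpts):
import numpy as np                      # used as np.load / np.save
from mpi4py import MPI                  # used for MPI.INFO_NULL, Lookup_name, Connect

# Hypothetical locations; adjust to the actual project layout.
from base.PT import PTWorker, Client
from base.helper_funcs import test_intercomm
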
class EASGD_PTWorker(PTWorker):
    
    '''
    Worker class based on a specific synchronization rule (EASGD)
    Executing training routine and periodically reporting results to server
    
    '''
    
    def __init__(self, port, config, device):        
        PTWorker.__init__(self, port = port, \
                                config = config, \
                                device = device)
                                
        self.worker_id = self.config['worker_id']
        
        if self.config['sync_start']:
            # sync start register, 
            # use the COMM_WORLD to communicate with server
            self._MPI_register()
            self.model.verbose = self.verbose 
        else:
            # async start register, 
            # build a separate intercomm to communicate with server
            self.MPI_register()
            self.model.verbose = self.verbose
            
        #if self.verbose: print 'worker registered'
                             
        self.prepare_worker()                        
        self.prepare_recorder()
        self.prepare_iterator()
        
        self.uepoch = None
        if self.config['resume_train']:
            self.uepoch = self.config['load_epoch']
            self.load_model(self.uepoch)

        self.train_len = self.config['avg_freq']
        self.val_len = len(self.data[2])
        self.mode = None
        self.lastmode = None
        self.count = 0
        
        if self.verbose:
            self.rec_name = 'inforec.pkl'
        else:
            self.rec_name = 'inforec_'+ str(self.worker_id) + '.pkl'
        
    def prepare_param_exchanger(self):
        
        from base.exchanger import EASGD_Exchanger

        self.exchanger = EASGD_Exchanger(self.config, \
                                    self.drv, \
                                    self.model.params, \
                                    etype='worker')
                                    
    def prepare_recorder(self):
        
        from base.recorder import Recorder
        
        self.recorder = Recorder(self.config)
                                    
    def prepare_iterator(self):
        
        from base.iterator import P_iter
        
        # iterator won't make another copy of the model 
        # instead it will just call its compiled train function
        
        self.train_iterator = P_iter(self.config, self.model, \
                                    self.data[0], self.data[1],  'train')
        self.val_iterator = P_iter(self.config, self.model, \
                                    self.data[2], self.data[3], 'val')
                                    
                                    
    def load_model(self, load_epoch):
        
        layers = self.model.layers
        path = self.config['load_path']
        s_lr = self.model.shared_lr
        vels = self.model.vels

        
        # TODO: verify the previously saved lr; when training with averaging it is scaled by the number of workers (size)
        import os  
        s_lr.set_value(np.load(os.path.join(path, 
                  'lr_' + str(load_epoch) + '.npy')))
        
        from base.helper_funcs import load_weights, load_momentums
        load_weights(layers, path, load_epoch)
        #load_momentums(vels, path, load_epoch)
            
        if self.verbose: 
            print '\nlearning rate loaded %f' % s_lr.get_value()
            print 'weights and momentums loaded from epoch %d in %s' % (load_epoch,path)
            
            record_file_path = self.config['record_dir'] + 'inforec.pkl'
            # BUG: which worker's inforec should be used? Use only the recording
            # worker's; if it exists, move it into history.
            if os.path.exists(record_file_path):
                import glob
                history_folder = self.config['record_dir']+ 'history*' 
                find = glob.glob(history_folder)
                #print find
                if find != []:
                    history_folder = sorted(find)[-1]
                    #print history_folder

                    history_folder = history_folder.rsplit('_', 1)[0] + '_' + \
                             "%d" % (int(history_folder.rsplit('_', 1)[-1])+1) + '/'
                    
                else:
                    history_folder = self.config['record_dir']+ 'history_0' + '/'
                
                print 'creating inforec history folder: ' + history_folder
                    
                os.makedirs(history_folder)
                import shutil
                shutil.copy(record_file_path, history_folder+'inforec.pkl')
                self.recorder.load(filepath = record_file_path)
                # print type(self.recorder.info_dict['train_info'])
                # print len(self.recorder.info_dict['train_info'])
                #
                # print type(self.recorder.info_dict['val_info'])
                # print len(self.recorder.info_dict['val_info'])
            
            else:
                raise OSError('record file not found at %s' % record_file_path)
                
            
    def save_model(self): 
        
        assert self.uepoch is not None
        layers = self.model.layers
        path = self.config['weights_dir']
        vels = self.model.vels  
        
        from base.helper_funcs import save_weights, save_momentums
        save_weights(layers, path, self.uepoch)
        np.save(path + 'lr_' + str(self.uepoch) + \
                        '.npy', self.model.shared_lr.get_value())
        #save_momentums(vels,self.config['weights_dir'], self.uepoch)
        if self.verbose:
            print '\nweights and momentums saved at epoch %d' % self.uepoch
            
    def train(self):
        
        for i in range(self.train_len):
            
            for subb_ind in range(self.config['n_subb']):
                #print self.count
                self.train_iterator.next(self.recorder,self.count)
            
            self.count += 1
            self.recorder.print_train_info(self.count)

            
        self.recorder.start()
        reply = self.request(dict(done=self.train_len))
        
        self.exchanger.comm = self.intercomm
        self.action(message = 'exchange', \
                    action=self.exchanger.exchange)
        self.recorder.end('comm')
        
        self.lastmode = 'train'

        
    def val(self):
        
        if self.lastmode == 'train':
            self.train_iterator.reset()
        
        self.model.set_dropout_off()
        
        for i in range(self.val_len):
        
            self.val_iterator.next(self.recorder,self.count)
            
            if self.verbose: print '.',
        
        self.recorder.print_val_info(self.count)
        
        self.model.set_dropout_on()
        
        self.val_iterator.reset()
                                    
    
    def copy_to_local(self):
        
        self.exchanger.comm = self.intercomm
        self.action(message = 'copy_to_local', \
                    action=self.exchanger.copy_to_local)
        if self.verbose: print '\nSynchronized param with server'
                    
    def adjust_lr(self):
        
        self.uepoch, self.n_workers = self.request('uepoch')
        
        #if self.verbose: print 'global epoch %d, %d workers online' % (self.uepoch, self.n_workers )
        
        self.model.adjust_lr(self.uepoch, size = self.n_workers)
        
        
    def run(self):
        
        # override PTWorker class method
        
        if self.verbose: print 'worker %s started' % self.worker_id
        
        self.prepare_param_exchanger()
        
        # start training with the most recent server parameter
        self.copy_to_local()
        
        self.adjust_lr()
                    
        epoch_start = False
        

        while True:

            self.mode = self.request('next')
            
            #print self.mode

            if self.mode == 'train':
                
                if epoch_start == False:
                    self.recorder.start_epoch()
                    epoch_start = True
                    if self.verbose: 
                        print '\nNow training'

                self.train()
                
            if self.mode == 'adjust_lr':
                
                self.adjust_lr()
                #self.copy_to_local()

            if self.mode == 'val':

                if self.verbose: 
                    print '\nNow validating'
                
                self.copy_to_local()

                self.val()
                
                self.recorder.save(self.count, self.model.shared_lr.get_value(), \
                        filepath = self.config['record_dir'] + self.rec_name)
                        
                self.uepoch, self.n_workers = self.request('uepoch')
                
                if self.uepoch % self.config['snapshot_freq'] == 0:
                    # TODO BUG: with very few training images, uepoch may advance
                    # by more than one between checks, so a snapshot epoch can be skipped
                    self.save_model()
                
                self.copy_to_local()
                
                if epoch_start == True:
                    self.recorder.end_epoch(self.count, self.uepoch)
                    epoch_start = False
                        
            if self.mode == 'stop':
                
                self.copy_to_local()

                self.val()
                
                if epoch_start == True:
                    self.recorder.end_epoch(self.count, self.uepoch)
                    epoch_start = False
                
                if self.verbose: print '\nOptimization finished'
                
                break
        
        self.para_load_close() # TODO: some workers block here and cannot disconnect
        self.ctx.pop()
        self.MPI_deregister()
        
        if self.verbose: print '\nWorker %s deregistered' % self.worker_id
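A minimal launch sketch for the worker above, assuming the EASGD server is already running and that config is a plain dict holding the keys the class reads (worker_id, sync_start, avg_freq, n_subb, resume_train, record_dir, weights_dir, snapshot_freq, ...). The port number, config file name and device string are illustrative assumptions, not values taken from the original project.

# Hypothetical launch script; names and values below are assumptions.
import sys
import yaml

if __name__ == '__main__':

    with open('config.yaml', 'r') as f:
        config = yaml.safe_load(f)            # must contain the keys listed above

    device = sys.argv[1] if len(sys.argv) > 1 else 'gpu0'

    worker = EASGD_PTWorker(port=5555, config=config, device=device)
    worker.run()
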
Example #2
class Async_PTWorker(Client,PTWorker):
    
    '''
    Asynchronous Worker class
    
    '''
    
    def __init__(self, port, config, device):
        Client.__init__(self, port = port)        
        PTWorker.__init__(self, config = config, \
                                device = device)
                                
        if self.config['sync_start']:
            self.config['size'] = 1
                                
        self.config['worker_id'] = self.worker_id
        
        if self.config['sync_start']:
            # sync start register, 
            # use the COMM_WORLD to communicate with server
            self._MPI_register()
            self.model.verbose = self.verbose 
        else:
            # async start register, 
            # build a separate intercomm to communicate with server
            self.MPI_register()
            self.model.verbose = self.verbose
        
        self.train_len = self.config['sync_freq']  # needs to be 1 for ASGD
        self.val_len = len(self.data[2])
        self.mode = None
        self.lastmode = None
        self.count = 0
        
        if self.verbose:
            self.rec_name = 'inforec.pkl'
        else:
            self.rec_name = 'inforec_'+ str(self.worker_id) + '.pkl'


            
    def MPI_register(self):
        
        # async start register, 
        # build a separate intercomm to communicate with server
        
        first = self.request('connect')
        
        # self.verbose = (first == 'first')
        
        info = MPI.INFO_NULL
        
        service = 'parallel-training'
        
        port = MPI.Lookup_name(service, info)
        
        self.intercomm = MPI.COMM_WORLD.Connect(port, info, root=0)

        self.config['irank'] = self.intercomm.rank 
        # size on the local side
        self.config['isize'] = self.intercomm.size 
        # size on the remote side
        self.config['iremotesize'] = self.intercomm.remote_size
        
        test_intercomm(self.intercomm, rank=1)
    
    def _MPI_register(self):
        
        # sync start register, 
        # use the COMM_WORLD to communicate with server
        
        first = self.request('sync_register')
        
        self.verbose = (first == 'first')
        self.config['verbose'] = self.verbose
        
        self.intercomm = self.comm
        
        self.comm.send(int(self.rank), dest=0, tag = int(self.worker_id))
        
        self.config['irank'] = self.intercomm.rank
        
        self.config['isize'] = self.intercomm.size 
        
    def MPI_deregister(self):
        
        self.request('disconnect')
        
        try:
            self.intercomm.Disconnect()
        except:
            pass
    
        
    def prepare_param_exchanger(self):
        
        # different in EASGD and ASGD
        
        pass
                                    
    def prepare_recorder(self):
        
        from base.recorder import Recorder
        
        self.recorder = Recorder(self.config)
                                    
    def prepare_iterator(self):
        
        # different in EASGD and ASGD
        pass
                                    
                                    
    def load_model(self, load_epoch):
        
        layers = self.model.layers
        path = self.config['load_path']
        s_lr = self.model.shared_lr
        vels = self.model.vels

        
        # TODO: verify the previously saved lr; when training with averaging it is scaled by the number of workers (size)
        import os  
        s_lr.set_value(np.load(os.path.join(path, 
                  'lr_' + str(load_epoch) + '.npy')))
        
        from base.helper_funcs import load_weights, load_momentums
        load_weights(layers, path, load_epoch)
        #load_momentums(vels, path, load_epoch)
            
        if self.verbose: 
            print '\nlearning rate loaded %f' % s_lr.get_value()
            print 'weights and momentums loaded from epoch %d in %s' % (load_epoch,path)
            
            record_file_path = self.config['record_dir'] + 'inforec.pkl'
            # BUG: which worker's inforec should be used? Use only the recording
            # worker's; if it exists, move it into history.
            if os.path.exists(record_file_path):
                import glob
                history_folder = self.config['record_dir']+ 'history*' 
                find = glob.glob(history_folder)
                #print find
                if find != []:
                    history_folder = sorted(find)[-1]
                    #print history_folder

                    history_folder = history_folder.rsplit('_', 1)[0] + '_' + \
                             "%d" % (int(history_folder.rsplit('_', 1)[-1])+1) + '/'
                    
                else:
                    history_folder = self.config['record_dir']+ 'history_0' + '/'
                
                print 'creating inforec history folder: ' + history_folder
                    
                os.makedirs(history_folder)
                import shutil
                shutil.copy(record_file_path, history_folder+'inforec.pkl')
                self.recorder.load(filepath = record_file_path)
                # print type(self.recorder.info_dict['train_info'])
                # print len(self.recorder.info_dict['train_info'])
                #
                # print type(self.recorder.info_dict['val_info'])
                # print len(self.recorder.info_dict['val_info'])
            
            else:
                raise OSError('record file not found at %s' % record_file_path)
                
            
    def save_model(self): 
        
        assert self.uepoch is not None
        layers = self.model.layers
        path = self.config['weights_dir']
        vels = self.model.vels  
        
        from base.helper_funcs import save_weights, save_momentums
        save_weights(layers, path, self.uepoch)
        np.save(path + 'lr_' + str(self.uepoch) + \
                        '.npy', self.model.shared_lr.get_value())
        #save_momentums(vels,self.config['weights_dir'], self.uepoch)
        if self.verbose:
            print '\nweights and momentums saved at epoch %d' % self.uepoch
            
    def train(self):
        
        for i in range(self.train_len):
            
            for subb_ind in range(self.config['n_subb']):
                #print self.count
                self.train_iterator.next(self.recorder,self.count)
            
            self.count += 1
            self.recorder.print_train_info(self.count)

            
        self.recorder.start()
        reply = self.request(dict(done=self.train_len))
        
        self.exchanger.comm = self.intercomm
        self.action(message = 'exchange', \
                    action=self.exchanger.exchange)
        self.recorder.end('comm')
        
        self.lastmode = 'train'

        
    def val(self):
        
        if self.lastmode == 'train':
            self.train_iterator.reset()
        
        self.model.set_dropout_off()
        
        for i in range(self.val_len):
        
            self.val_iterator.next(self.recorder,self.count)
            
            if self.verbose: print '.',
        
        self.recorder.print_val_info(self.count)
        
        self.model.set_dropout_on()
        
        self.val_iterator.reset()
                                    
    
    def copy_to_local(self):
        
        self.exchanger.comm = self.intercomm
        self.action(message = 'copy_to_local', \
                    action=self.exchanger.copy_to_local)
        if self.verbose: print '\nSynchronized param with server'
                    
    def adjust_lr(self):
        
        self.uepoch, self.n_workers = self.request('uepoch')
        
        #if self.verbose: print 'global epoch %d, %d workers online' % (self.uepoch, self.n_workers )
        
        self.model.adjust_lr(self.uepoch, size = self.n_workers)
    
        if self.verbose: 
            print 'Learning rate now: %.10f' % \
                    np.float32(self.model.shared_lr.get_value())
        
        
    def run(self):
        
        # override PTWorker class method
        
        if self.verbose: print 'worker %s started' % self.worker_id
        
        self.prepare_param_exchanger()
        
        # start training with the most recent server parameter
        self.copy_to_local()
        
        self.adjust_lr()
                    
        epoch_start = False
        

        while True:

            self.mode = self.request('next')
            
            #print self.mode

            if self.mode == 'train':
                
                if epoch_start == False:
                    self.recorder.start_epoch()
                    epoch_start = True
                    if self.verbose: 
                        print '\nNow training'

                self.train()
                
            if self.mode == 'adjust_lr':
                
                self.adjust_lr()
                #self.copy_to_local()

            if self.mode == 'val':

                if self.verbose: 
                    print '\nNow validating'
                
                self.copy_to_local()

                self.val()
                
                self.recorder.save(self.count, self.model.shared_lr.get_value(), \
                        filepath = self.config['record_dir'] + self.rec_name)
                        
                self.uepoch, self.n_workers = self.request('uepoch')
                
                if self.uepoch % self.config['snapshot_freq'] == 0:
                    # TODO BUG: with very few training images, uepoch may advance
                    # by more than one between checks, so a snapshot epoch can be skipped
                    self.save_model()
                
                self.copy_to_local()
                
                if epoch_start == True:
                    self.recorder.end_epoch(self.count, self.uepoch)
                    epoch_start = False
                        
            if self.mode == 'stop':
                
                self.copy_to_local()

                self.val()
                
                if epoch_start == True:
                    self.recorder.end_epoch(self.count, self.uepoch)
                    epoch_start = False
                
                if self.verbose: print '\nOptimization finished'
                
                break
        
        self.para_load_close() # TODO: some workers block here and cannot disconnect
        self.ctx.pop()
        self.MPI_deregister()
        
        if self.verbose: print '\nWorker %s deregistered' % self.worker_id
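MPI_register above uses MPI-2 dynamic process management: the server publishes a port under the service name 'parallel-training', and each asynchronously started worker looks that name up and connects, which yields the intercommunicator stored in self.intercomm. A stripped-down sketch of that handshake is shown below using current mpi4py signatures (the listed code targets an older mpi4py release where the info argument is positional); apart from the service name, everything here is illustrative rather than taken from the original server code.

# Sketch of the publish/lookup handshake behind MPI_register; not the original server code.
from mpi4py import MPI

service = 'parallel-training'

def server_accept():
    # Server side: open a port, publish it under the service name, wait for a worker.
    port = MPI.Open_port()
    MPI.Publish_name(service, port)
    return MPI.COMM_WORLD.Accept(port, root=0)   # blocks until a worker connects

def worker_connect():
    # Worker side: resolve the published port and connect, as MPI_register does.
    port = MPI.Lookup_name(service)
    return MPI.COMM_WORLD.Connect(port, root=0)
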
Example #3
class EASGD_PTWorker(PTWorker):
    '''
    Worker class based on a specific synchronization rule (EASGD)
    Executing training routine and periodically reporting results to server
    
    '''
    def __init__(self, port, config, device):
        PTWorker.__init__(self, port = port, \
                                config = config, \
                                device = device)

        self.worker_id = self.config['worker_id']

        if self.config['sync_start']:
            # sync start register,
            # use the COMM_WORLD to communicate with server
            self._MPI_register()
            self.model.verbose = self.verbose
        else:
            # async start register,
            # build a separate intercomm to communicate with server
            self.MPI_register()
            self.model.verbose = self.verbose

        #if self.verbose: print 'worker registered'

        self.prepare_worker()
        self.prepare_recorder()
        self.prepare_iterator()

        self.uepoch = None
        if self.config['resume_train']:
            self.uepoch = self.config['load_epoch']
            self.load_model(self.uepoch)

        self.train_len = self.config['avg_freq']
        self.val_len = len(self.data[2])
        self.mode = None
        self.lastmode = None
        self.count = 0

        if self.verbose:
            self.rec_name = 'inforec.pkl'
        else:
            self.rec_name = 'inforec_' + str(self.worker_id) + '.pkl'

    def prepare_param_exchanger(self):

        from base.exchanger import EASGD_Exchanger

        self.exchanger = EASGD_Exchanger(self.config, \
                                    self.drv, \
                                    self.model.params, \
                                    etype='worker')

    def prepare_recorder(self):

        from base.recorder import Recorder

        self.recorder = Recorder(self.config)

    def prepare_iterator(self):

        from base.iterator import P_iter

        # iterator won't make another copy of the model
        # instead it will just call its compiled train function

        self.train_iterator = P_iter(self.config, self.model, \
                                    self.data[0], self.data[1],  'train')
        self.val_iterator = P_iter(self.config, self.model, \
                                    self.data[2], self.data[3], 'val')

    def load_model(self, load_epoch):

        layers = self.model.layers
        path = self.config['load_path']
        s_lr = self.model.shared_lr
        vels = self.model.vels

        # TODO: verify the previously saved lr; when training with averaging it is scaled by the number of workers (size)
        import os
        s_lr.set_value(
            np.load(os.path.join(path, 'lr_' + str(load_epoch) + '.npy')))

        from base.helper_funcs import load_weights, load_momentums
        load_weights(layers, path, load_epoch)
        #load_momentums(vels, path, load_epoch)

        if self.verbose:
            print '\nlearning rate loaded %f' % s_lr.get_value()
            print 'weights and momentums loaded from epoch %d in %s' % (
                load_epoch, path)

            record_file_path = self.config['record_dir'] + 'inforec.pkl'
            # BUG: which worker's inforec should be used? Use only the recording
            # worker's; if it exists, move it into history.
            if os.path.exists(record_file_path):
                import glob
                history_folder = self.config['record_dir'] + 'history*'
                find = glob.glob(history_folder)
                #print find
                if find != []:
                    history_folder = sorted(find)[-1]
                    #print history_folder

                    history_folder = history_folder.rsplit('_', 1)[0] + '_' + \
                             "%d" % (int(history_folder.rsplit('_', 1)[-1])+1) + '/'

                else:
                    history_folder = self.config['record_dir'] + 'history_0' + '/'

                print 'creating inforec history folder: ' + history_folder

                os.makedirs(history_folder)
                import shutil
                shutil.copy(record_file_path, history_folder + 'inforec.pkl')
                self.recorder.load(filepath=record_file_path)
                # print type(self.recorder.info_dict['train_info'])
                # print len(self.recorder.info_dict['train_info'])
                #
                # print type(self.recorder.info_dict['val_info'])
                # print len(self.recorder.info_dict['val_info'])

            else:
                raise OSError('record file not found at %s' % record_file_path)

    def save_model(self):

        assert self.uepoch is not None
        layers = self.model.layers
        path = self.config['weights_dir']
        vels = self.model.vels

        from base.helper_funcs import save_weights, save_momentums
        save_weights(layers, path, self.uepoch)
        np.save(path + 'lr_' + str(self.uepoch) + \
                        '.npy', self.model.shared_lr.get_value())
        #save_momentums(vels,self.config['weights_dir'], self.uepoch)
        if self.verbose:
            print '\nweights and momentums saved at epoch %d' % self.uepoch

    def train(self):

        for i in range(self.train_len):

            for subb_ind in range(self.config['n_subb']):
                #print self.count
                self.train_iterator.next(self.recorder, self.count)

            self.count += 1
            self.recorder.print_train_info(self.count)

        self.recorder.start()
        reply = self.request(dict(done=self.train_len))

        self.exchanger.comm = self.intercomm
        self.action(message = 'exchange', \
                    action=self.exchanger.exchange)
        self.recorder.end('comm')

        self.lastmode = 'train'

    def val(self):

        if self.lastmode == 'train':
            self.train_iterator.reset()

        self.model.set_dropout_off()

        for i in range(self.val_len):

            self.val_iterator.next(self.recorder, self.count)

            if self.verbose: print '.',

        self.recorder.print_val_info(self.count)

        self.model.set_dropout_on()

        self.val_iterator.reset()

    def copy_to_local(self):

        self.exchanger.comm = self.intercomm
        self.action(message = 'copy_to_local', \
                    action=self.exchanger.copy_to_local)
        if self.verbose: print '\nSynchronized param with server'

    def adjust_lr(self):

        self.uepoch, self.n_workers = self.request('uepoch')

        #if self.verbose: print 'global epoch %d, %d workers online' % (self.uepoch, self.n_workers )

        self.model.adjust_lr(self.uepoch, size=self.n_workers)

    def run(self):

        # override PTWorker class method

        if self.verbose: print 'worker %s started' % self.worker_id

        self.prepare_param_exchanger()

        # start training with the most recent server parameter
        self.copy_to_local()

        self.adjust_lr()

        epoch_start = False

        while True:

            self.mode = self.request('next')

            #print self.mode

            if self.mode == 'train':

                if epoch_start == False:
                    self.recorder.start_epoch()
                    epoch_start = True
                    if self.verbose:
                        print '\nNow training'

                self.train()

            if self.mode == 'adjust_lr':

                self.adjust_lr()
                #self.copy_to_local()

            if self.mode == 'val':

                if self.verbose:
                    print '\nNow validating'

                self.copy_to_local()

                self.val()

                self.recorder.save(self.count, self.model.shared_lr.get_value(), \
                        filepath = self.config['record_dir'] + self.rec_name)

                self.uepoch, self.n_workers = self.request('uepoch')

                if self.uepoch % self.config['snapshot_freq'] == 0:
                    # TODO BUG: with very few training images, uepoch may advance
                    # by more than one between checks, so a snapshot epoch can be skipped
                    self.save_model()

                self.copy_to_local()

                if epoch_start == True:
                    self.recorder.end_epoch(self.count, self.uepoch)
                    epoch_start = False

            if self.mode == 'stop':

                self.copy_to_local()

                self.val()

                if epoch_start == True:
                    self.recorder.end_epoch(self.count, self.uepoch)
                    epoch_start = False

                if self.verbose: print '\nOptimization finished'

                break

        self.para_load_close()  # TODO: some workers block here and cannot disconnect
        self.ctx.pop()
        self.MPI_deregister()

        if self.verbose: print '\nWorker %s deregistered' % self.worker_id
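The EASGD_Exchanger driving the exchange and copy_to_local actions is not part of this listing. As a rough numpy sketch of the elastic-averaging rule it is expected to apply on each exchange (worker and central parameters pulled toward each other, following the EASGD paper by Zhang et al., 2015), with the moving rate alpha treated as an assumed hyperparameter:

# Illustrative numpy sketch of one EASGD exchange; not the actual EASGD_Exchanger.
import numpy as np

def easgd_exchange(worker_params, center_params, alpha=0.5):
    # One elastic-averaging step per parameter tensor:
    #   worker: x_i <- x_i - alpha * (x_i - x_center)
    #   center: x_c <- x_c + alpha * (x_i - x_center)
    for w, c in zip(worker_params, center_params):
        diff = alpha * (w - c)
        w -= diff    # pull the worker toward the center
        c += diff    # pull the center toward the worker
    return worker_params, center_params

# Example: two 1-D parameter "tensors" per side.
w = [np.zeros(3), np.ones(2)]
c = [np.ones(3), np.zeros(2)]
easgd_exchange(w, c, alpha=0.5)

In the classes above this exchange runs over the intercommunicator inside train(), once per train() call, i.e. every avg_freq (EASGD worker) or sync_freq (asynchronous worker) mini-batches.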