def __init__(self, opt, shared=None):
    # Must call _get_init_model() first so that paths are updated if necessary
    # (e.g., a .dict file)
    init_model, is_finetune = self._get_init_model(opt, shared)
    opt['rank_candidates'] = True
    super().__init__(opt, shared)

    if shared:
        states = None
    else:
        # Note: we cannot change the type of metrics ahead of time, so you
        # should correctly initialize to floats or ints here
        self.metrics['loss'] = 0.0
        self.metrics['examples'] = 0
        self.metrics['rank'] = 0.0
        self.metrics['mrr'] = 0.0
        self.metrics['train_accuracy'] = 0.0

        self.criterion = self.build_criterion()
        self.model = self.build_model()
        if self.model is None or self.criterion is None:
            raise AttributeError(
                'build_model() and build_criterion() need to return the model or criterion'
            )
        if self.use_cuda:
            self.model.cuda()
            self.criterion.cuda()
        if self.fp16:
            self.model = self.model.half()

        if init_model:
            print('Loading existing model parameters from ' + init_model)
            states = self.load(init_model)
        else:
            states = {}

    self.rank_top_k = opt.get('rank_top_k', -1)

    # Vectorize and save fixed/vocab candidates once upfront if applicable
    self.set_fixed_candidates(shared)
    self.set_vocab_candidates(shared)

    if shared:
        # We don't use get here because hasattr is used on optimizer later.
        if 'optimizer' in shared:
            self.optimizer = shared['optimizer']
    else:
        optim_params = [p for p in self.model.parameters() if p.requires_grad]
        self.init_optim(
            optim_params, states.get('optimizer'), states.get('optimizer_type')
        )
        self.build_lr_scheduler(states, hard_reset=is_finetune)

    if shared is None and is_distributed():
        self.model = torch.nn.parallel.DistributedDataParallel(
            self.model, device_ids=[self.opt['gpu']], broadcast_buffers=False
        )
def __init__(self, opt, shared=None):
    # Must call _get_init_model() first so that paths are updated if necessary
    # (e.g., a .dict file)
    init_model, _ = self._get_init_model(opt, shared)
    opt['rank_candidates'] = True
    super().__init__(opt, shared)

    if shared:
        self.model = shared['model']
        self.metrics = shared['metrics']
        self.fixed_candidates = shared['fixed_candidates']
        self.fixed_candidate_vecs = shared['fixed_candidate_vecs']
        self.vocab_candidates = shared['vocab_candidates']
        self.vocab_candidate_vecs = shared['vocab_candidate_vecs']
        states = None
    else:
        self.metrics = {
            'loss': 0.0,
            'examples': 0,
            'rank': 0,
            'train_accuracy': 0.0,
        }
        self.build_model()
        if init_model:
            print('Loading existing model parameters from ' + init_model)
            states = self.load(init_model)
        else:
            states = {}

    self.rank_loss = nn.CrossEntropyLoss(reduce=True, size_average=False)
    if self.use_cuda:
        self.model.cuda()
        self.rank_loss.cuda()

    # Vectorize and save fixed/vocab candidates once upfront if applicable
    self.set_fixed_candidates(shared)
    self.set_vocab_candidates(shared)

    if shared:
        # We don't use get here because hasattr is used on optimizer later.
        if 'optimizer' in shared:
            self.optimizer = shared['optimizer']
    else:
        optim_params = [p for p in self.model.parameters() if p.requires_grad]
        self.init_optim(
            optim_params, states.get('optimizer'), states.get('optimizer_type')
        )
        self.build_lr_scheduler(states)

    if shared is None and is_distributed():
        self.model = torch.nn.parallel.DistributedDataParallel(
            self.model, device_ids=[self.opt['gpu']], broadcast_buffers=False
        )
def __init__(self, opt, shared=None):
    super().__init__(opt, shared)
    self.data_parallel = opt.get('data_parallel') and self.use_cuda
    if self.data_parallel:
        from parlai.core.distributed_utils import is_distributed

        if is_distributed():
            raise ValueError('Cannot combine --data-parallel and distributed mode')
        self.model = torch.nn.DataParallel(self.model)
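# A minimal sketch (an assumption, not a copy of parlai.core.distributed_utils)
# of what the is_distributed() guard used in these snippets typically reduces
# to: report distributed mode only when a torch.distributed process group has
# actually been initialized.
import torch.distributed as dist


def is_distributed():
    return dist.is_available() and dist.is_initialized()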
def _sync_training_metrics(self, metrics):
    """
    Sync training metrics across workers.

    A handful of special cases are handled as exceptions, and the remaining
    metrics are simply averaged across workers.
    """
    if not is_distributed():
        # nothing special needed
        return metrics
    all_versions = all_gather_list(metrics)
    return self._average_dicts(all_versions)
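# _average_dicts() is referenced above but not shown. A hypothetical sketch,
# assuming each worker contributes a flat dict of numeric metrics gathered via
# all_gather_list(); every key is averaged across the per-worker dicts.
def _average_dicts(self, all_versions):
    averaged = {}
    num_workers = len(all_versions)
    for metrics in all_versions:
        for k, v in metrics.items():
            averaged[k] = averaged.get(k, 0) + v / num_workers
    return averaged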
def __init__(self, opt, shared=None):
    super().__init__(opt, shared)
    self.rank_loss = torch.nn.CrossEntropyLoss(reduce=True, size_average=True)
    if self.use_cuda:
        self.rank_loss.cuda()
    self.data_parallel = opt.get('data_parallel') and self.use_cuda
    if self.data_parallel:
        from parlai.core.distributed_utils import is_distributed

        if is_distributed():
            raise ValueError('Cannot combine --data-parallel and distributed mode')
        self.model = torch.nn.DataParallel(self.model)
def __init__(self, opt, shared=None):
    opt['rank_candidates'] = True
    super().__init__(opt, shared)
    # optionally wrap the model in DataParallel; this cannot be combined with
    # distributed mode
    self.data_parallel = opt.get('data_parallel') and self.use_cuda
    if self.data_parallel:
        self.model = torch.nn.DataParallel(self.model)
        if is_distributed():
            raise ValueError('Cannot combine --data-parallel and distributed mode')
    self.clip = -1
    self.NULL_IDX = self.dict.pad_idx
    self.START_IDX = self.dict.start_idx
    self.END_IDX = self.dict.end_idx
def __init__(self, opt, shared=None):
    # download pretrained models
    download(opt['datapath'])
    self.pretrained_path = os.path.join(
        opt['datapath'], 'models', 'bert_models', MODEL_PATH
    )
    super().__init__(opt, shared)
    # optionally wrap the model in DataParallel; this cannot be combined with
    # distributed mode
    self.data_parallel = opt.get('data_parallel') and self.use_cuda
    if self.data_parallel:
        self.model = torch.nn.DataParallel(self.model)
        if is_distributed():
            raise ValueError('Cannot combine --data-parallel and distributed mode')
    self.clip = -1
    self.NULL_IDX = self.dict.pad_idx
    self.START_IDX = self.dict.start_idx
    self.END_IDX = self.dict.end_idx
def __init__(self, opt, shared=None):
    opt['rank_candidates'] = True
    opt['candidates'] = "batch"
    if opt.get('eval_candidates', None) is None:
        opt['eval_candidates'] = "inline"
    self.clip = -1
    super().__init__(opt, shared)
    # optionally wrap the model in DataParallel; this cannot be combined with
    # distributed mode
    self.data_parallel = opt.get('data_parallel') and self.use_cuda
    if self.data_parallel:
        self.model = torch.nn.DataParallel(self.model)
        if is_distributed():
            raise ValueError('Cannot combine --data-parallel and distributed mode')
    self.NULL_IDX = self.dict.pad_idx
    self.START_IDX = self.dict.start_idx
    self.END_IDX = self.dict.end_idx
    # the default rank loss does not average; this one averages over the batch
    self.rank_loss = torch.nn.CrossEntropyLoss(reduce=True, size_average=True)
def __init__(self, opt, shared=None):
    # download pretrained models
    download(opt['datapath'])
    self.pretrained_path = os.path.join(
        opt['datapath'], 'models', 'bert_models', MODEL_PATH
    )
    opt['pretrained_path'] = self.pretrained_path
    self.clip = -1
    super().__init__(opt, shared)
    # optionally wrap the model in DataParallel; this cannot be combined with
    # distributed mode
    self.data_parallel = opt.get('data_parallel') and self.use_cuda
    if self.data_parallel and shared is None:
        self.model = torch.nn.DataParallel(self.model)
        if is_distributed():
            raise ValueError('Cannot combine --data-parallel and distributed mode')
    self.NULL_IDX = self.dict.pad_idx
    self.START_IDX = self.dict.start_idx
    self.END_IDX = self.dict.end_idx
    # the default rank loss does not average; this one averages over the batch
    self.rank_loss = torch.nn.CrossEntropyLoss(reduce=True, size_average=True)
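# Note: reduce= and size_average= are deprecated CrossEntropyLoss arguments in
# current PyTorch. The calls above map onto the reduction= keyword as follows
# (behavior is unchanged; this is just the modern spelling):
import torch.nn as nn

rank_loss_mean = nn.CrossEntropyLoss(reduction='mean')  # reduce=True, size_average=True
rank_loss_sum = nn.CrossEntropyLoss(reduction='sum')    # reduce=True, size_average=False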
def __init__(self, opt, shared=None):
    init_model, is_finetune = self._get_init_model(opt, shared)
    super().__init__(opt, shared)

    self.beam_dot_log = opt.get('beam_dot_log', False)
    self.beam_size = opt.get('beam_size', 1)
    self.beam_min_n_best = opt.get('beam_min_n_best', 3)
    self.beam_min_length = opt.get('beam_min_length', 3)
    self.beam_block_ngram = opt.get('beam_block_ngram', 0)
    self.skip_generation = opt.get('skip_generation', False)

    if shared:
        # set up shared properties
        self.model = shared['model']
        self.criterion = shared['criterion']
        self.metrics = shared['metrics']
        states = shared.get('states', {})
    else:
        # Note: we cannot change the type of metrics ahead of time, so you
        # should correctly initialize to floats or ints here
        self.metrics = {
            'nll_loss': 0.0,
            'loss': 0.0,
            'num_tokens': 0,
            'correct_tokens': 0,
            'total_skipped_batches': 0,
        }
        # this is not a shared instance of this class, so do full init
        if self.beam_dot_log:
            self.beam_dot_dir = tempfile.mkdtemp(
                prefix='{}-beamdot-beamsize-{}-'.format(
                    os.path.basename(opt.get('model_file')), self.beam_size
                )
            )
            print('[ Saving dot beam logs in {} ]'.format(self.beam_dot_dir))

        self.build_criterion()
        self.build_model()
        if self.fp16:
            self.model = self.model.half()

        if init_model is not None:
            # load model parameters if available
            print('[ Loading existing model params from {} ]'.format(init_model))
            states = self.load(init_model)
        else:
            states = {}

    if (
        # only build an optimizer if we're training
        'train' in opt.get('datatype', '')
        # and this is the main model, or on every fork if doing hogwild
        and (shared is None or self.opt.get('numthreads', 1) > 1)
    ):
        # set up the optimizer, restoring any saved optimizer state
        self.init_optim(
            [p for p in self.model.parameters() if p.requires_grad],
            optim_states=states.get('optimizer'),
            saved_optim_type=states.get('optimizer_type'),
        )
        self.build_lr_scheduler(states, hard_reset=is_finetune)

    if shared is None and is_distributed():
        self.model = torch.nn.parallel.DistributedDataParallel(
            self.model, device_ids=[self.opt['gpu']], broadcast_buffers=False
        )

    self.reset()
def train(self):
    if is_distributed():
        warn_once(
            "Distributed training outputs average-per-worker metrics during "
            "training, and may be slightly distorted. Validation/test are "
            "unadulterated."
        )
    opt = self.opt
    world = self.world
    with world:
        while True:
            # do one example / batch of examples
            world.parley()
            self.parleys += 1
            # print(world.display())

            # get the total training examples done, compute epochs
            self._total_epochs = (
                self._preempted_epochs
                + num_workers() * self.world.get_total_epochs()
            )
            exs_per_epoch = self.world.num_examples()
            self._total_exs = int(np.round(self._total_epochs * exs_per_epoch))

            # and use the primary worker's timings for everything
            train_time, log_time, validate_time = sync_object(
                (
                    self.train_time.time(),
                    self.log_time.time(),
                    self.validate_time.time(),
                )
            )

            # check counters and timers
            if self._total_epochs >= self.max_num_epochs:
                self.log()
                print(
                    '[ num_epochs completed:{} time elapsed:{}s ]'.format(
                        self.max_num_epochs, train_time
                    )
                )
                break
            if train_time > self.max_train_time:
                print('[ max_train_time elapsed:{}s ]'.format(train_time))
                break
            if log_time > self.log_every_n_secs:
                self.log()
            if (
                validate_time > self.val_every_n_secs
                or self._total_epochs - self.last_valid_epoch
                >= self.val_every_n_epochs
            ):
                stop_training = self.validate()
                self.last_valid_epoch = self._total_epochs
                if stop_training:
                    break
            if (
                self.save_time.time() > self.save_every_n_secs
                and opt.get('model_file')
                and is_primary_worker()
            ):
                print(
                    "[ saving model checkpoint: {}.checkpoint ]".format(
                        opt['model_file']
                    )
                )
                self.save_model('.checkpoint')
                self.save_time.reset()

    if not self.saved and is_primary_worker():
        # save agent
        self.save_model()
    elif opt.get('model_file'):
        # reload best validation model
        self.agent = create_agent(opt)

    valid_world = _maybe_load_eval_world(self.agent, opt, 'valid')
    v_report = run_eval(valid_world, opt, 'valid', write_log=True)
    test_world = _maybe_load_eval_world(self.agent, opt, 'test')
    t_report = run_eval(test_world, opt, 'test', write_log=True)
    if valid_world:
        valid_world.shutdown()
    if test_world:
        test_world.shutdown()

    return v_report, t_report
def __init__(self, opt, shared=None):
    init_model, self.is_finetune = self._get_init_model(opt, shared)
    super().__init__(opt, shared)

    # set up classes
    if opt.get('classes') is None:
        raise RuntimeError('Must specify --classes argument.')
    if not shared:
        self.class_list = opt['classes']
        self.class_dict = {val: i for i, val in enumerate(self.class_list)}
        if opt.get('class_weights', None) is not None:
            self.class_weights = opt['class_weights']
        else:
            self.class_weights = [1.0 for _ in self.class_list]
        self.reset_metrics()
    else:
        self.class_list = shared['class_list']
        self.class_dict = shared['class_dict']
        self.class_weights = shared['class_weights']

    # get reference class; if opt['get_all_metrics'] is False, this is
    # used to compute metrics
    # in binary classification, opt['threshold'] applies to the ref class
    if opt['ref_class'] is None or opt['ref_class'] not in self.class_dict:
        self.ref_class = self.class_list[0]
    else:
        self.ref_class = opt['ref_class']
        ref_class_id = self.class_list.index(self.ref_class)
        if ref_class_id != 0:
            # move to the front of the class list
            self.class_list.insert(0, self.class_list.pop(ref_class_id))
    if not opt['get_all_metrics']:
        warn_once(
            'Using %s as the class for computing P, R, and F1' % self.ref_class
        )

    # set up threshold, only used in binary classification
    if len(self.class_list) == 2 and opt.get('threshold', 0.5) != 0.5:
        self.threshold = opt['threshold']
    else:
        self.threshold = None

    # set up model and optimizers
    if shared:
        self.model = shared['model']
    else:
        self.model = self.build_model()
        self.criterion = self.build_criterion()
        if self.model is None or self.criterion is None:
            raise AttributeError(
                'build_model() and build_criterion() need to return the model or criterion'
            )
        if self.use_cuda:
            self.model.cuda()
            self.criterion.cuda()
        if init_model:
            print('Loading existing model parameters from ' + init_model)
            self.load(init_model)
        if self.use_cuda:
            if self.opt['data_parallel']:
                if is_distributed():
                    raise ValueError(
                        'Cannot combine --data-parallel and distributed mode'
                    )
                self.model = torch.nn.DataParallel(self.model)

    if shared:
        # We don't use get here because hasattr is used on optimizer later.
        if 'optimizer' in shared:
            self.optimizer = shared['optimizer']
    else:
        optim_params = [p for p in self.model.parameters() if p.requires_grad]
        self.init_optim(optim_params)
        self.build_lr_scheduler()
def build_dict(opt, skip_if_built=False):
    if isinstance(opt, ParlaiParser):
        print('[ Deprecated Warning: should be passed opt not Parser ]')
        opt = opt.parse_args()
    if not opt.get('dict_file'):
        print('Tried to build dictionary but `--dict-file` is not set. Set ' +
              'this param so the dictionary can be saved.')
        return
    if skip_if_built and os.path.isfile(opt['dict_file']):
        # Dictionary already built, skip all loading or setup
        print("[ dictionary already built. ]")
        return None

    if is_distributed():
        raise ValueError('Dictionaries should be pre-built before distributed train.')

    if opt.get('dict_class'):
        # Custom dictionary class
        dictionary = str2class(opt['dict_class'])(opt)
    else:
        # Default dictionary class
        dictionary = DictionaryAgent(opt)

    if os.path.isfile(opt['dict_file']):
        # Dictionary already built, return loaded dictionary agent
        print("[ dictionary already built. ]")
        return dictionary

    # we use the train set to build the dictionary
    ordered_opt = copy.deepcopy(opt)
    ordered_opt['numthreads'] = 1
    ordered_opt['batchsize'] = 1
    ordered_opt['image_mode'] = 'none'
    ordered_opt['pytorch_teacher_batch_sort'] = False
    if ordered_opt['task'] == 'pytorch_teacher' or not ordered_opt['task']:
        pytorch_teacher_task = ordered_opt.get('pytorch_teacher_task', '')
        if pytorch_teacher_task != '':
            ordered_opt['task'] = pytorch_teacher_task

    datatypes = ['train:ordered:stream']
    if opt.get('dict_include_valid'):
        datatypes.append('valid:stream')
    if opt.get('dict_include_test'):
        datatypes.append('test:stream')
    cnt = 0
    for dt in datatypes:
        ordered_opt['datatype'] = dt
        world_dict = create_task(ordered_opt, dictionary)
        # pass examples to dictionary
        print('[ running dictionary over data.. ]')
        log_time = TimeLogger()
        total = world_dict.num_examples()
        if opt['dict_maxexs'] >= 0:
            total = min(total, opt['dict_maxexs'])

        log_every_n_secs = opt.get('log_every_n_secs', None)
        if log_every_n_secs:
            pbar = tqdm.tqdm(
                total=total, desc='Building dictionary', unit='ex', unit_scale=True
            )
        else:
            pbar = None
        while not world_dict.epoch_done():
            cnt += 1
            if cnt > opt['dict_maxexs'] and opt['dict_maxexs'] >= 0:
                print('Processed {} exs, moving on.'.format(opt['dict_maxexs']))
                # don't wait too long...
                break
            world_dict.parley()
            if pbar:
                pbar.update(1)
        if pbar:
            pbar.close()

    dictionary.save(opt['dict_file'], sort=True)
    print(
        '[ dictionary built with {} tokens in {}s ]'.format(
            len(dictionary), round(log_time.total_time(), 2)
        )
    )
    return dictionary
def __init__(self, opt, shared=None):
    init_model = None
    if not shared:  # only do this on first setup
        # first check load path in case we need to override paths
        if opt.get('init_model') and os.path.isfile(opt['init_model']):
            # check first for 'init_model' for loading model from file
            init_model = opt['init_model']
        if opt.get('model_file') and os.path.isfile(opt['model_file']):
            # next check for 'model_file', this would override init_model
            init_model = opt['model_file']

        if init_model is not None:
            # if we are loading a model, should load its dict too
            if os.path.isfile(init_model + '.dict') or opt['dict_file'] is None:
                opt['dict_file'] = init_model + '.dict'

    super().__init__(opt, shared)

    self.beam_dot_log = opt.get('beam_dot_log', False)
    self.beam_size = opt.get('beam_size', 1)
    self.beam_min_n_best = opt.get('beam_min_n_best', 3)
    self.beam_min_length = opt.get('beam_min_length', 3)
    self.beam_block_ngram = opt.get('beam_block_ngram', 0)
    self.skip_generation = opt.get('skip_generation', False)

    if shared:
        # set up shared properties
        self.model = shared['model']
        self.criterion = shared['criterion']
        self.metrics = shared['metrics']
        states = shared.get('states', {})
    else:
        self.metrics = {
            'loss': 0.0,
            'num_tokens': 0,
            'correct_tokens': 0,
            'total_skipped_batches': 0,
        }
        # this is not a shared instance of this class, so do full init
        if self.beam_dot_log:
            self.beam_dot_dir = tempfile.mkdtemp(
                prefix='{}-beamdot-beamsize-{}-'.format(
                    os.path.basename(opt.get('model_file')), self.beam_size
                )
            )
            print('[ Saving dot beam logs in {} ]'.format(self.beam_dot_dir))

        self.build_criterion()
        self.build_model()

        if init_model is not None:
            # load model parameters if available
            print('[ Loading existing model params from {} ]'.format(init_model))
            states = self.load(init_model)
        else:
            states = {}

    if shared is None and is_distributed():
        self.model = torch.nn.parallel.DistributedDataParallel(
            self.model, device_ids=[self.opt['gpu']], broadcast_buffers=False
        )

    if 'train' in opt.get('datatype', ''):
        # only build an optimizer when training, restoring any saved state
        self.init_optim(
            [p for p in self.model.parameters() if p.requires_grad],
            optim_states=states.get('optimizer'),
            saved_optim_type=states.get('optimizer_type'),
        )
        self.build_lr_scheduler()

    self.reset()
def __init__(self, opt, shared=None):
    init_model, is_finetune = self._get_init_model(opt, shared)
    super().__init__(opt, shared)

    self.beam_size = opt.get('beam_size', 1)
    self.beam_min_n_best = opt.get('beam_min_n_best', 3)
    self.beam_min_length = opt.get('beam_min_length', 3)
    if opt.get('beam_block_ngram'):
        # check for old opts where we might have used beam blocking.
        # this was a super rare option, so I don't expect this to be used.
        raise RuntimeError('Beam ngram blocking is no longer supported.')

    if shared:
        # set up shared properties
        states = shared.get('states', {})
    else:
        # Note: we cannot change the type of metrics ahead of time, so you
        # should correctly initialize to floats or ints here
        self.metrics['nll_loss'] = 0.0
        self.metrics['loss'] = 0.0
        self.metrics['correct_tokens'] = 0
        self.metrics['total_skipped_batches'] = 0

        # this is not a shared instance of this class, so do full init
        self.criterion = self.build_criterion()
        self.model = self.build_model()
        if self.model is None or self.criterion is None:
            raise AttributeError(
                'build_model() and build_criterion() need to return the model or criterion'
            )
        if self.use_cuda:
            self.model.cuda()
            self.criterion.cuda()

        # ensure all distributed copies will always be in sync
        check_synced_parameters(self.model)
        print("Total parameters: {}".format(self._total_parameters()))
        print("Trainable parameters: {}".format(self._trainable_parameters()))

        if self.fp16:
            self.model = self.model.half()

        if init_model is not None:
            # load model parameters if available
            print('[ Loading existing model params from {} ]'.format(init_model))
            states = self.load(init_model)
        else:
            states = {}

    if (
        # only build an optimizer if we're training
        'train' in opt.get('datatype', '')
        # and this is the main model, or on every fork if doing hogwild
        and (shared is None or self.opt.get('numthreads', 1) > 1)
    ):
        # set up the optimizer, restoring any saved optimizer state
        self.init_optim(
            [p for p in self.model.parameters() if p.requires_grad],
            optim_states=states.get('optimizer'),
            saved_optim_type=states.get('optimizer_type'),
        )
        self.build_lr_scheduler(states, hard_reset=is_finetune)

    if shared is None and is_distributed():
        self.model = torch.nn.parallel.DistributedDataParallel(
            self.model, device_ids=[self.opt['gpu']], broadcast_buffers=False
        )

    self.reset()
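# check_synced_parameters() is called above to verify that distributed replicas
# start from identical weights. A rough sketch of one way such a check could
# work (an assumption, not the ParlAI implementation): compare a scalar summary
# of the parameters across workers and fail loudly on any mismatch.
import torch
import torch.distributed as dist


def check_synced_parameters(model):
    if not (dist.is_available() and dist.is_initialized()):
        return  # nothing to check outside distributed mode
    with torch.no_grad():
        local_sum = sum(p.double().sum().item() for p in model.parameters())
    sums = [None] * dist.get_world_size()
    dist.all_gather_object(sums, local_sum)
    if any(abs(s - sums[0]) > 1e-6 for s in sums):
        raise RuntimeError('Model parameters are out of sync across workers.')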
def train(self):
    if is_distributed():
        warn_once(
            "Distributed training outputs average-per-worker metrics during "
            "training, and may be slightly distorted. Validation/test are "
            "unadulterated."
        )
    opt = self.opt
    world = self.world
    with world:
        while True:
            # do one example / batch of examples
            world.parley()
            self.parleys += 1

            # get the total training examples done, compute epochs
            self._total_epochs = (
                self._preempted_epochs
                + num_workers() * self.world.get_total_epochs()
            )
            exs_per_epoch = self.world.num_examples()
            self._total_exs = int(np.round(self._total_epochs * exs_per_epoch))

            # and use the primary worker's timings for everything
            train_time, log_time, validate_time = sync_object(
                (
                    self.train_time.time(),
                    self.log_time.time(),
                    self.validate_time.time(),
                )
            )

            # check counters and timers
            if self._total_epochs >= self.max_num_epochs:
                self.log()
                print(
                    '[ num_epochs completed:{} time elapsed:{}s ]'.format(
                        self.max_num_epochs, train_time
                    )
                )
                break
            if train_time > self.max_train_time:
                print('[ max_train_time elapsed:{}s ]'.format(train_time))
                break
            if log_time > self.log_every_n_secs:
                self.log()
            if (
                validate_time > self.val_every_n_secs
                or self._total_epochs - self.last_valid_epoch
                >= self.val_every_n_epochs
            ):
                stop_training = self.validate()
                self.last_valid_epoch = self._total_epochs

                # --------------- change by hengyicai -------------------------
                # run evaluation on the test data as well
                test_opt = copy.deepcopy(self.opt)
                test_opt['display_examples'] = False
                test_opt['report_freq'] = 0
                if self.test_world is None:
                    # we need to load the world now
                    self.test_world = _maybe_load_eval_world(
                        self.agent, test_opt, 'test'
                    )
                run_eval(self.test_world, test_opt, 'test', -1, write_log=True)
                # --------------- change by hengyicai -------------------------

                if stop_training:
                    break
            if (
                self.save_time.time() > self.save_every_n_secs
                and opt.get('model_file')
                and is_primary_worker()
            ):
                print(
                    "[ saving model checkpoint: {}.checkpoint ]".format(
                        opt['model_file']
                    )
                )
                self.save_model('.checkpoint')
                self.save_time.reset()

    if not self.saved and is_primary_worker():
        # save agent
        self.save_model()
    elif opt.get('model_file'):
        # reload best validation model
        self.agent = create_agent(opt)

    valid_world = _maybe_load_eval_world(self.agent, opt, 'valid')
    max_exs = opt['validation_max_exs'] if opt.get('short_final_eval') else -1
    v_report = run_eval(valid_world, opt, 'valid', max_exs, write_log=True)
    test_world = _maybe_load_eval_world(self.agent, opt, 'test')
    t_report = run_eval(test_world, opt, 'test', max_exs, write_log=True)
    if valid_world:
        valid_world.shutdown()
    if test_world:
        test_world.shutdown()

    # --------------- change by hengyicai -------------------------
    last_model = opt.get('model_file') + '.checkpoint'
    if os.path.isfile(last_model):
        print('[ Conducting evaluations on valid and test data using the last model. ]')
        last_model_opt = copy.deepcopy(opt)
        last_model_opt['model_file'] = last_model
        last_agent = create_agent(last_model_opt)
        valid_world = _maybe_load_eval_world(last_agent, last_model_opt, 'valid')
        max_exs = (
            last_model_opt['validation_max_exs']
            if last_model_opt.get('short_final_eval')
            else -1
        )
        run_eval(valid_world, last_model_opt, 'valid', max_exs, write_log=True)
        test_world = _maybe_load_eval_world(last_agent, last_model_opt, 'test')
        run_eval(test_world, last_model_opt, 'test', max_exs, write_log=True)
        if valid_world:
            valid_world.shutdown()
        if test_world:
            test_world.shutdown()
    # --------------- change by hengyicai -------------------------

    print_announcements(opt)

    return v_report, t_report