def __init__(self, opt, shared=None):
    super().__init__(opt, shared)
    self.max_output_length = None
    # build the backwards (target-to-source) model alongside the forward model
    self.model_backwards = self.build_model_backwards()
    if self.model_backwards is None:
        raise AttributeError(
            'build_model_backwards() needs to return the backwards model'
        )

    if self.use_cuda:
        self.model_backwards.cuda()

    # keep distributed copies in sync and report parameter counts
    sync_parameters(self.model_backwards)
    print("Total backwards parameters: {}".format(self._total_parameters()))
    print(
        "Trainable backwards parameters: {}".format(self._trainable_parameters())
    )

    if self.fp16:
        self.model_backwards = self.model_backwards.half()

    if shared is None and is_distributed():
        self.model_backwards = torch.nn.parallel.DistributedDataParallel(
            self.model_backwards,
            device_ids=[self.opt['gpu']],
            broadcast_buffers=False,
        )
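# The __init__ above relies on a build_model_backwards() hook analogous to
# ParlAI's build_model(). Below is a minimal sketch of what a subclass might
# provide, assuming the backwards model simply reuses the forward architecture;
# the class name and base class choice are illustrative assumptions, not from
# the source.
from parlai.agents.transformer.transformer import TransformerGeneratorAgent


class BackwardsGeneratorAgent(TransformerGeneratorAgent):  # hypothetical name
    def build_model_backwards(self):
        # return a bare nn.Module; the __init__ above handles
        # .cuda()/.half()/DistributedDataParallel wrapping
        return self.build_model()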
def build_regret_model(self) -> RagModel:
    """
    Build and return the regret RagModel.

    Assumes the dictionary is the same as the main model's.
    """
    model_file = self.opt['regret_model_file']
    if model_file:
        assert os.path.exists(
            model_file
        ), 'specify correct path for --regret-model-file'
        regret_opt = Opt.load(f'{model_file}.opt')
        regret_opt['n_docs'] = self.opt['n_docs']  # Urgent that this is the same
        # add keys that were not in this model when originally trained
        regret_opt.update(
            {k: v for k, v in self.opt.items() if k not in regret_opt}
        )
        retriever_shared = None
        if all(
            [
                regret_opt[k] == self.opt[k]
                for k in [
                    'rag_retriever_type',
                    'path_to_index',
                    'path_to_dpr_passages',
                ]
            ]
        ):
            logging.warning('Sharing retrievers between model and regret model!')
            retriever_shared = self.model.encoder.retriever.share()

        model = RagModel(regret_opt, self.dict, retriever_shared=retriever_shared)
        with PathManager.open(self.opt['regret_model_file'], 'rb') as f:
            states = torch.load(
                f,
                map_location=lambda cpu, _: cpu,
                pickle_module=parlai.utils.pickle,
            )
        assert 'model' in states
        model.load_state_dict(states['model'])

        # use the local `model` throughout: self.regret_model is not set until
        # this method returns
        if self.model_parallel:
            ph = PipelineHelper()
            ph.check_compatibility(self.opt)
            model = ph.make_parallel(model)
        else:
            model.cuda()
        if self.fp16:
            model = model.half()

        sync_parameters(model)
        train_params = trainable_parameters(model)
        total_params = total_parameters(model)
        logging.info(
            f"Total regret parameters: {total_params:,d} ({train_params:,d} trainable)"
        )
    else:
        model = self.model

    return model
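# Hedged wiring sketch (an assumption, not taken from the source):
# build_regret_model() would typically be called after the main model exists, so
# the retriever-sharing check above can inspect self.model.encoder.retriever.
# The flag and attribute names below are illustrative.
def _maybe_init_regret_model(self, opt: Opt) -> None:
    # hypothetical helper, called from the agent's __init__ after self.model is built
    self.regret = opt.get('regret', False)
    if self.regret:
        self.regret_model = self.build_regret_model()
        # the regret model only produces intermediate generations; keep it frozen
        self.regret_model.eval()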
def __init__(self, opt: Opt, shared=None):
    init_model, is_finetune = self._get_init_model(opt, shared)
    super().__init__(opt, shared)

    self.beam_size = opt.get('beam_size', 1)
    self.beam_min_length = opt.get('beam_min_length', 1)
    self.beam_block_ngram = opt.get('beam_block_ngram', -1)
    self.beam_context_block_ngram = opt.get('beam_context_block_ngram', -1)
    self.output_token_losses = opt.get('verbose', False)
    self.compute_tokenized_bleu = opt.get('compute_tokenized_bleu', False)

    if shared:
        # set up shared properties
        states = shared.get('states', {})
    else:
        # Note: we cannot change the type of metrics ahead of time, so you
        # should correctly initialize to floats or ints here.
        # This is not a shared instance of this class, so do full init.
        self.criterion = self.build_criterion()
        # ensure all distributed copies will always be in sync
        self.model = self.build_model()
        if self.model is None or self.criterion is None:
            raise AttributeError(
                'build_model() and build_criterion() need to return the model '
                'or criterion'
            )
        if self.use_cuda:
            self.model.cuda()
            self.criterion.cuda()

        sync_parameters(self.model)
        print("Total parameters: {}".format(self._total_parameters()))
        print("Trainable parameters: {}".format(self._trainable_parameters()))

        if self.fp16:
            self.model = self.model.half()

        if init_model is not None:
            # load model parameters if available
            print('[ Loading existing model params from {} ]'.format(init_model))
            states = self.load(init_model)
        else:
            states = {}

    if (
        # only build an optimizer if we're training
        'train' in opt.get('datatype', '')
        # and this is the main model, or on every fork if doing hogwild
        and (shared is None or self.opt.get('numthreads', 1) > 1)
    ):
        self.init_optim(
            [p for p in self.model.parameters() if p.requires_grad],
            optim_states=states.get('optimizer'),
            saved_optim_type=states.get('optimizer_type'),
        )
        self.build_lr_scheduler(states, hard_reset=is_finetune)

    if shared is None and is_distributed():
        self.model = torch.nn.parallel.DistributedDataParallel(
            self.model, device_ids=[self.opt['gpu']], broadcast_buffers=False
        )

    self.reset()
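# The __init__ above depends on two subclass hooks, build_model() and
# build_criterion(). Below is a minimal sketch of a generator agent implementing
# them, assuming ParlAI's transformer modules; the subclass name is illustrative
# and the criterion mirrors a standard per-token cross entropy.
from torch import nn
from parlai.agents.transformer.modules import TransformerGeneratorModel
from parlai.agents.transformer.transformer import TransformerGeneratorAgent


class MyGeneratorAgent(TransformerGeneratorAgent):  # hypothetical subclass
    def build_model(self):
        # return a bare nn.Module; the __init__ above moves it to GPU/fp16/DDP
        return TransformerGeneratorModel(self.opt, self.dict)

    def build_criterion(self):
        # per-token cross entropy over the vocabulary, ignoring padding
        return nn.CrossEntropyLoss(ignore_index=self.NULL_IDX, reduction='none')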