def __init__(self, hparams, **kwargs): # when loading from a pytorch lightning checkpoint, hparams are passed as dict if isinstance(hparams, dict): hparams = AttrDict(hparams) if hparams.model_type == "rag_sequence": self.model_class = RagSequenceForGeneration elif hparams.model_type == "rag_token": self.model_class = RagTokenForGeneration elif hparams.model_type == "bart": self.model_class = BartForConditionalGeneration else: self.model_class = T5ForConditionalGeneration self.is_rag_model = is_rag_model(hparams.model_type) config_class = RagConfig if self.is_rag_model else AutoConfig config = config_class.from_pretrained(hparams.model_name_or_path) # set retriever parameters config.index_name = hparams.index_name or config.index_name config.passages_path = hparams.passages_path or config.passages_path config.index_path = hparams.index_path or config.index_path config.use_dummy_dataset = hparams.use_dummy_dataset # set extra_model_params for generator configs and load_model extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "attention_dropout", "dropout") if self.is_rag_model: if hparams.prefix is not None: config.generator.prefix = hparams.prefix config.label_smoothing = hparams.label_smoothing hparams, config.generator = set_extra_model_params( extra_model_params, hparams, config.generator) if hparams.distributed_retriever == "ray": # The Ray retriever needs the handles to the retriever actors. retriever = RagRayDistributedRetriever.from_pretrained( hparams.model_name_or_path, hparams.actor_handles, config=config) if hparams.end2end: ctx_encoder_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained( "facebook/dpr-ctx_encoder-multiset-base") retriever.set_ctx_encoder_tokenizer(ctx_encoder_tokenizer) else: logger.info( "please use RAY as the distributed retrieval method") model = self.model_class.from_pretrained( hparams.model_name_or_path, config=config, retriever=retriever) if hparams.end2end: ctx_encoder = DPRContextEncoder.from_pretrained( hparams.context_encoder_name) model.set_context_encoder_for_training(ctx_encoder) prefix = config.question_encoder.prefix else: if hparams.prefix is not None: config.prefix = hparams.prefix hparams, config = set_extra_model_params(extra_model_params, hparams, config) model = self.model_class.from_pretrained( hparams.model_name_or_path, config=config) prefix = config.prefix tokenizer = (RagTokenizer.from_pretrained(hparams.model_name_or_path) if self.is_rag_model else AutoTokenizer.from_pretrained( hparams.model_name_or_path)) self.config_dpr = DPRConfig.from_pretrained( hparams.context_encoder_name) self.custom_config = hparams self.context_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained( hparams.context_encoder_name) super().__init__(hparams, config=config, tokenizer=tokenizer, model=model) save_git_info(self.hparams.output_dir) self.output_dir = Path(self.hparams.output_dir) self.dpr_ctx_check_dir = str(Path( self.hparams.output_dir)) + "/dpr_ctx_checkpoint" self.metrics_save_path = Path(self.output_dir) / "metrics.json" self.hparams_save_path = Path(self.output_dir) / "hparams.pkl" pickle_save(self.hparams, self.hparams_save_path) self.step_count = 0 self.metrics = defaultdict(list) self.dataset_kwargs: dict = dict( data_dir=self.hparams.data_dir, max_source_length=self.hparams.max_source_length, prefix=prefix or "", ) n_observations_per_split = { "train": self.hparams.n_train, "val": self.hparams.n_val, "test": self.hparams.n_test, } self.n_obs = { k: v if v >= 0 else None for k, v in n_observations_per_split.items() } self.target_lens = { "train": self.hparams.max_target_length, "val": self.hparams.val_max_target_length, "test": self.hparams.test_max_target_length, } assert self.target_lens["train"] <= self.target_lens[ "val"], f"target_lens: {self.target_lens}" assert self.target_lens["train"] <= self.target_lens[ "test"], f"target_lens: {self.target_lens}" self.hparams.git_sha = get_git_info()["repo_sha"] self.num_workers = hparams.num_workers self.distributed_port = self.hparams.distributed_port # For single GPU training, init_ddp_connection is not called. # So we need to initialize the retrievers here. if hparams.gpus <= 1: if hparams.distributed_retriever == "ray": self.model.retriever.init_retrieval() else: logger.info( "please use RAY as the distributed retrieval method") self.distributed_retriever = hparams.distributed_retriever
def __init__(self, hparams, **kwargs): # when loading from a pytorch lightning checkpoint, hparams are passed as dict if isinstance(hparams, dict): hparams = AttrDict(hparams) if hparams.model_type == "rag_sequence": self.model_class = RagSequenceForGeneration elif hparams.model_type == "rag_token": self.model_class = RagTokenForGeneration elif hparams.model_type == "bart": self.model_class = BartForConditionalGeneration else: self.model_class = T5ForConditionalGeneration self.is_rag_model = is_rag_model(hparams.model_type) config_class = RagConfig if self.is_rag_model else AutoConfig config = config_class.from_pretrained(hparams.model_name_or_path) # set retriever parameters config.index_name = hparams.index_name or config.index_name config.passages_path = hparams.passages_path or config.passages_path config.index_path = hparams.index_path or config.index_path config.use_dummy_dataset = hparams.use_dummy_dataset config.n_docs = 4 config.n_docs_splits = 4 config.max_combined_length = 500 config.n_words_to_src = 40 # using 40 tokens to add to src config.skip_ec = False config.bart_base_qe = True # using bart encoder as qe config.do_deduplication = True # set extra_model_params for generator configs and load_model extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "attention_dropout", "dropout") if self.is_rag_model: if hparams.prefix is not None: config.generator.prefix = hparams.prefix config.label_smoothing = hparams.label_smoothing hparams, config.generator = set_extra_model_params( extra_model_params, hparams, config.generator) if hparams.distributed_retriever == "pytorch": retriever = RagPyTorchDistributedRetriever.from_pretrained( hparams.model_name_or_path, config=config) elif hparams.distributed_retriever == "ray": # The Ray retriever needs the handles to the retriever actors. retriever = RagRayDistributedRetriever.from_pretrained( hparams.model_name_or_path, hparams.actor_handles, config=config) model = self.model_class.from_pretrained( hparams.model_name_or_path, config=config, retriever=retriever) prefix = config.question_encoder.prefix else: if hparams.prefix is not None: config.prefix = hparams.prefix hparams, config = set_extra_model_params(extra_model_params, hparams, config) model = self.model_class.from_pretrained( hparams.model_name_or_path, config=config) prefix = config.prefix tokenizer = (RagTokenizer.from_pretrained(hparams.model_name_or_path) if self.is_rag_model else AutoTokenizer.from_pretrained( hparams.model_name_or_path)) # if the bart base qe wants to be used if config.bart_base_qe: #print("yuh") # load bbforrag bart_base_model = BartForConditionalGeneration.from_pretrained( "facebook/bart-base").cuda() model.question_encoder = bart_base_model.model.encoder #sys.exit() super().__init__(hparams, config=config, tokenizer=tokenizer, model=model) save_git_info(self.hparams.output_dir) self.output_dir = Path(self.hparams.output_dir) self.metrics_save_path = Path(self.output_dir) / "metrics.json" self.hparams_save_path = Path(self.output_dir) / "hparams.pkl" pickle_save(self.hparams, self.hparams_save_path) self.step_count = 0 self.metrics = defaultdict(list) self.dataset_kwargs: dict = dict( data_dir=self.hparams.data_dir, max_source_length=self.hparams.max_source_length, prefix=prefix or "", ) n_observations_per_split = { "train": self.hparams.n_train, "val": self.hparams.n_val, "test": self.hparams.n_test, } self.n_obs = { k: v if v >= 0 else None for k, v in n_observations_per_split.items() } self.target_lens = { "train": self.hparams.max_target_length, "val": self.hparams.val_max_target_length, "test": self.hparams.test_max_target_length, } assert self.target_lens["train"] <= self.target_lens[ "val"], f"target_lens: {self.target_lens}" assert self.target_lens["train"] <= self.target_lens[ "test"], f"target_lens: {self.target_lens}" self.hparams.git_sha = get_git_info()["repo_sha"] self.num_workers = hparams.num_workers self.distributed_port = self.hparams.distributed_port # For single GPU training, init_ddp_connection is not called. # So we need to initialize the retrievers here. if hparams.gpus <= 1: if hparams.distributed_retriever == "ray": self.model.retriever.init_retrieval() elif hparams.distributed_retriever == "pytorch": self.model.retriever.init_retrieval(self.distributed_port) self.distributed_retriever = hparams.distributed_retriever self.source_tokenizer = (self.tokenizer.question_encoder if isinstance( self.tokenizer, RagTokenizer) else self.tokenizer)
def __init__(self, hparams, **kwargs): if isinstance(hparams, dict): hparams = AttrDict(hparams) if hparams.model_type == "bart": self.model_class = BartForConditionalGeneration else: self.model_class = T5ForConditionalGeneration config_class = AutoConfig config = config_class.from_pretrained(hparams.model_name_or_path) # set extra_model_params for generator configs and load_model extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "attention_dropout", "dropout") if hparams.prefix is not None: config.prefix = hparams.prefix hparams, config = set_extra_model_params(extra_model_params, hparams, config) model = self.model_class.from_pretrained(hparams.model_name_or_path, config=config) prefix = config.prefix tokenizer = (AutoTokenizer.from_pretrained(hparams.model_name_or_path)) super().__init__(hparams, config=config, tokenizer=tokenizer, model=model) save_git_info(self.hparams.output_dir) self.output_dir = Path(self.hparams.output_dir) self.metrics_save_path = Path(self.output_dir) / "metrics.json" self.hparams_save_path = Path(self.output_dir) / "hparams.pkl" pickle_save(self.hparams, self.hparams_save_path) self.step_count = 0 self.metrics = defaultdict(list) self.dataset_kwargs: dict = dict( data_dir=self.hparams.data_dir, max_source_length=self.hparams.max_source_length, prefix=prefix or "", ) n_observations_per_split = { "train": self.hparams.n_train, "val": self.hparams.n_val, "test": self.hparams.n_test, } self.n_obs = { k: v if v >= 0 else None for k, v in n_observations_per_split.items() } self.target_lens = { "train": self.hparams.max_target_length, "val": self.hparams.val_max_target_length, "test": self.hparams.test_max_target_length, } assert self.target_lens["train"] <= self.target_lens[ "val"], f"target_lens: {self.target_lens}" assert self.target_lens["train"] <= self.target_lens[ "test"], f"target_lens: {self.target_lens}" self.hparams.git_sha = get_git_info()["repo_sha"] self.num_workers = hparams.num_workers self.distributed_port = self.hparams.distributed_port
def __init__(self, hparams, **kwargs): # when loading from a pytorch lightning checkpoint, hparams are passed as dict if isinstance(hparams, dict): hparams = AttrDict(hparams) if hparams.model_type == "rag_sequence": self.model_class = RagSequenceForGeneration elif hparams.model_type == "rag_token": self.model_class = RagTokenForGeneration elif hparams.model_type == "bart": self.model_class = BartForConditionalGeneration else: self.model_class = T5ForConditionalGeneration self.is_rag_model = is_rag_model(hparams.model_type) config_class = RagConfig if self.is_rag_model else AutoConfig config = config_class.from_pretrained(hparams.model_name_or_path) # set retriever parameters config.n_docs = hparams.n_docs config.do_marginalize = hparams.do_marginalize or config.do_marginalize config.scoring_func = hparams.scoring_func or config.scoring_func logger.info("Using scoring function - {}".format(config.scoring_func)) config.segmentation = hparams.segmentation or config.segmentation config.max_combined_length = hparams.max_combined_length or config.max_combined_length config.max_source_length = hparams.max_source_length or config.max_source_length config.index_name = hparams.index_name or config.index_name config.passages_path = hparams.passages_path or config.passages_path config.index_path = hparams.index_path or config.index_path config.use_dummy_dataset = hparams.use_dummy_dataset if hparams.bm25: # hparams.bm25 = load_bm25_results(hparams.bm25) bm25 = load_bm25(hparams.bm25) config.bm25 = hparams.bm25 else: bm25 = None # set extra_model_params for generator configs and load_model extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "attention_dropout", "dropout") if self.is_rag_model: if hparams.prefix is not None: config.generator.prefix = hparams.prefix config.label_smoothing = hparams.label_smoothing hparams, config.generator = set_extra_model_params( extra_model_params, hparams, config.generator) if hparams.distributed_retriever == "pytorch": # pdb.set_trace() retriever = RagPyTorchDistributedRetriever.from_pretrained( hparams.model_name_or_path, config=config) elif hparams.distributed_retriever == "ray": # The Ray retriever needs the handles to the retriever actors. retriever = RagRayDistributedRetriever.from_pretrained( hparams.model_name_or_path, hparams.actor_handles, config=config) model = self.model_class.from_pretrained( hparams.model_name_or_path, config=config, retriever=retriever, bm25=bm25) prefix = config.question_encoder.prefix model.bm25 = bm25 else: if hparams.prefix is not None: config.prefix = hparams.prefix hparams, config = set_extra_model_params(extra_model_params, hparams, config) model = self.model_class.from_pretrained( hparams.model_name_or_path, config=config) prefix = config.prefix tokenizer = (RagTokenizer.from_pretrained(hparams.model_name_or_path) if self.is_rag_model else AutoTokenizer.from_pretrained( hparams.model_name_or_path)) super().__init__(hparams, config=config, tokenizer=tokenizer, model=model) save_git_info(self.hparams.output_dir) self.output_dir = Path(self.hparams.output_dir) self.metrics_save_path = Path(self.output_dir) / "metrics.json" self.hparams_save_path = Path(self.output_dir) / "hparams.pkl" pickle_save(self.hparams, self.hparams_save_path) self.step_count = 0 self.metrics = defaultdict(list) self.dataset_kwargs: dict = dict( data_dir=self.hparams.data_dir, max_source_length=self.hparams.max_source_length, prefix=prefix or "", ) n_observations_per_split = { "train": self.hparams.n_train, "val": self.hparams.n_val, "test": self.hparams.n_test, } self.n_obs = { k: v if v >= 0 else None for k, v in n_observations_per_split.items() } self.target_lens = { "train": self.hparams.max_target_length, "val": self.hparams.val_max_target_length, "test": self.hparams.test_max_target_length, } assert self.target_lens["train"] <= self.target_lens[ "val"], f"target_lens: {self.target_lens}" assert self.target_lens["train"] <= self.target_lens[ "test"], f"target_lens: {self.target_lens}" self.hparams.git_sha = get_git_info()["repo_sha"] self.num_workers = hparams.num_workers self.distributed_port = self.hparams.distributed_port # For single GPU training, init_ddp_connection is not called. # So we need to initialize the retrievers here. if hparams.gpus <= 1: if hparams.distributed_retriever == "ray": self.model.retriever.init_retrieval() elif hparams.distributed_retriever == "pytorch": self.model.retriever.init_retrieval(self.distributed_port) self.distributed_retriever = hparams.distributed_retriever