def __init__(self, opt):
    super().__init__(opt)
    # initialize from vocab path
    warn_once(
        "WARNING: BERT uses a Hugging Face tokenizer; ParlAI dictionary args are ignored"
    )
    download(opt["datapath"])
    vocab_path = PathManager.get_local_path(
        os.path.join(opt["datapath"], "models", "bert_models", VOCAB_PATH)
    )
    self.tokenizer = BertTokenizer.from_pretrained(vocab_path)

    self.start_token = "[CLS]"
    self.end_token = "[SEP]"
    self.null_token = "[PAD]"
    self.start_idx = self.tokenizer.convert_tokens_to_ids(["[CLS]"])[0]  # should be 101
    self.end_idx = self.tokenizer.convert_tokens_to_ids(["[SEP]"])[0]  # should be 102
    self.pad_idx = self.tokenizer.convert_tokens_to_ids(["[PAD]"])[0]  # should be 0
    # set tok2ind for special tokens
    self.tok2ind[self.start_token] = self.start_idx
    self.tok2ind[self.end_token] = self.end_idx
    self.tok2ind[self.null_token] = self.pad_idx
    # set ind2tok for special tokens
    self.ind2tok[self.start_idx] = self.start_token
    self.ind2tok[self.end_idx] = self.end_token
    self.ind2tok[self.pad_idx] = self.null_token
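# A quick sanity check for the special-token ids the initializer above expects.
# This is a minimal sketch assuming the `transformers` package and access to the
# public 'bert-base-uncased' vocabulary; the ids are fixed by the vocab file, so
# [CLS]/[SEP]/[PAD] should map to 101/102/0.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
print(tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", "[PAD]"]))  # [101, 102, 0]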
def __init__(self, opt, shared=None):
    # download pretrained models
    download(opt["datapath"])
    self.pretrained_path = PathManager.get_local_path(
        os.path.join(opt["datapath"], "models", "bert_models", MODEL_PATH)
    )
    opt["pretrained_path"] = self.pretrained_path
    self.add_cls_token = opt.get("add_cls_token", True)
    self.sep_last_utt = opt.get("sep_last_utt", False)
    super().__init__(opt, shared)
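# Illustrative sketch (not from the source) of the opt keys this initializer
# reads; the key names come from the code above, the values shown are its
# documented defaults, and the semantics in the comments are assumptions.
opt = {
    "datapath": "/path/to/data",  # where pretrained BERT weights are downloaded
    "add_cls_token": True,        # presumably: prepend [CLS] when vectorizing text
    "sep_last_utt": False,        # presumably: mark the last utterance with [SEP]
}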
def __init__(
    self,
    opt: Opt,
    dpr_model: str = 'bert',
    pretrained_path: str = DPR_ZOO_MODEL,
    encoder_type: str = 'query',
):
    # Override options
    try:
        config: BertConfig = BertConfig.from_pretrained('bert-base-uncased')
    except OSError:
        config_path = PathManager.get_local_path(
            os.path.join(opt['datapath'], "bert_base_uncased", self.CONFIG_PATH)
        )
        config: BertConfig = BertConfig.from_pretrained(config_path)
    pretrained_path = modelzoo_path(opt['datapath'], pretrained_path)  # type: ignore
    if not os.path.exists(pretrained_path):
        # when initializing from ParlAI RAG models, the pretrained path
        # may no longer exist. This is fine if we've already trained
        # the model.
        assert dpr_model == 'bert_from_parlai_rag'
        logging.error(f'Pretrained Path does not exist: {pretrained_path}')
        pretrained_path = modelzoo_path(opt['datapath'], DPR_ZOO_MODEL)  # type: ignore
        dpr_model = 'bert'
        logging.error(f'Setting to zoo model: {pretrained_path}')
    enc_opt = {
        "n_heads": config.num_attention_heads,
        "n_layers": config.num_hidden_layers,
        "embedding_size": config.hidden_size,
        "ffn_size": config.intermediate_size,
        "dropout": config.hidden_dropout_prob,
        "attention_dropout": config.attention_probs_dropout_prob,
        "activation": config.hidden_act,
        "variant": 'xlm',
        "reduction_type": 'first',
        "n_positions": config.max_position_embeddings,
        "n_segments": config.type_vocab_size,
    }
    embedding = torch.nn.Embedding(
        config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
    )
    super().__init__(
        Opt(enc_opt),
        vocabulary_size=config.vocab_size,
        padding_idx=config.pad_token_id,
        embedding=embedding,
        reduction_type='first',
    )
    self._load_state(opt['datapath'], dpr_model, pretrained_path, encoder_type)
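# Sketch of the BertConfig fields mapped into ParlAI encoder options above,
# assuming the `transformers` package is installed. For 'bert-base-uncased'
# these resolve to 12 heads, 12 layers, hidden size 768, and FFN size 3072.
from transformers import BertConfig

config = BertConfig.from_pretrained("bert-base-uncased")
print(
    config.num_attention_heads,  # -> n_heads (12)
    config.num_hidden_layers,    # -> n_layers (12)
    config.hidden_size,          # -> embedding_size (768)
    config.intermediate_size,    # -> ffn_size (3072)
)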
def _init_from_pretrained(self, opt):
    # load model
    model_sz = opt["gpt2_size"]
    if model_sz == "small":
        model_key = "gpt2"
    elif model_sz == "distilgpt2":
        model_key = "distilgpt2"
    else:
        model_key = f"gpt2-{model_sz}"
    # check if datapath has the files that Hugging Face code looks for
    hf_dir = os.path.join(opt["datapath"], "hf", model_key)
    if all(
        PathManager.exists(os.path.join(hf_dir, file_name))
        for file_name in ["pytorch_model.bin", "config.json"]
    ):
        fle_key = PathManager.get_local_path(hf_dir, recursive=True)
    else:
        fle_key = model_key
    return GPT2Model.from_pretrained(fle_key)
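# Sketch of the size-to-model-key mapping used above; the resulting keys
# ('gpt2', 'distilgpt2', 'gpt2-medium', ...) are public Hugging Face model ids.
for size in ("small", "distilgpt2", "medium", "large", "xl"):
    key = {"small": "gpt2", "distilgpt2": "distilgpt2"}.get(size, f"gpt2-{size}")
    print(size, "->", key)
# small -> gpt2
# distilgpt2 -> distilgpt2
# medium -> gpt2-medium
# large -> gpt2-large
# xl -> gpt2-xl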
def _init_tokenizer(
    self, dictionary: DictionaryAgent
) -> Union[BertTokenizer, DictionaryAgent]:
    """
    If a regular ParlAI model, use the regular dictionary; otherwise, build a
    BERT tokenizer as necessary.

    :param dictionary:
        ParlAI dictionary agent
    """
    if self.query_model in ['bert', 'bert_from_parlai_rag']:
        try:
            return BertTokenizer.from_pretrained('bert-base-uncased')
        except (ImportError, OSError):
            vocab_path = PathManager.get_local_path(
                os.path.join(self.datapath, "bert_base_uncased", self.VOCAB_PATH)
            )
            return transformers.BertTokenizer.from_pretrained(vocab_path)
    else:
        return dictionary
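# Hypothetical helper (not in the source) showing how a caller might handle the
# Union return type above: `txt2vec` is the ParlAI DictionaryAgent API, while
# `encode` is the Hugging Face tokenizer API.
def encode_query(tokenizer, text):
    if hasattr(tokenizer, "txt2vec"):  # ParlAI DictionaryAgent
        return tokenizer.txt2vec(text)
    return tokenizer.encode(text)      # transformers BertTokenizer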
def get_tokenizer(self, opt):
    """
    Instantiate tokenizer.
    """
    model_sz = opt["gpt2_size"]
    if model_sz == "small":
        model_key = "gpt2"
    elif model_sz == "distilgpt2":
        model_key = "distilgpt2"
    else:
        model_key = f"gpt2-{model_sz}"
    # check if datapath has the files that Hugging Face code looks for
    hf_dir = os.path.join(opt["datapath"], "hf", model_key)
    if all(
        PathManager.exists(os.path.join(hf_dir, file_name))
        for file_name in ["merges.txt", "vocab.json"]
    ):
        fle_key = PathManager.get_local_path(hf_dir, recursive=True)
    else:
        fle_key = model_key
    return GPT2Tokenizer.from_pretrained(fle_key)
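# Minimal usage sketch, assuming `transformers` is installed: GPT-2 uses a
# byte-level BPE, so a leading 'Ġ' in a token marks a preceding space.
from transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("gpt2")
print(tok.tokenize("Hello world"))  # ['Hello', 'Ġworld']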
def load_bert_state(
    datapath: str,
    state_dict: Dict[str, torch.Tensor],
    pretrained_dpr_path: str,
    encoder_type: str = 'query',
) -> Dict[str, torch.Tensor]:
    """
    Load BERT state from a Hugging Face model and convert it to a ParlAI model.

    :param datapath:
        path to the ParlAI data directory
    :param state_dict:
        ParlAI model state_dict
    :param pretrained_dpr_path:
        path to pretrained DPR model
    :param encoder_type:
        whether we're loading a document or query encoder

    :return new_state_dict:
        a state_dict with loaded weights
    """
    try:
        bert_model = BertModel.from_pretrained('bert-base-uncased')
    except OSError:
        model_path = PathManager.get_local_path(
            os.path.join(datapath, "bert_base_uncased")
        )
        bert_model = BertModel.from_pretrained(model_path)
    if pretrained_dpr_path:
        BertConversionUtils.load_dpr_model(
            bert_model, pretrained_dpr_path, encoder_type
        )
    bert_state_dict = bert_model.state_dict()
    for key in BERT_COMPATIBILITY_KEYS:
        bert_state_dict.pop(key, None)
    return_dict = BertConversionUtils.convert_bert_to_parlai(bert_state_dict)
    assert all(
        a in return_dict for a in state_dict
    ), f"not all weights are being loaded: {[k for k in state_dict if k not in return_dict]}"
    return return_dict
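# Hypothetical usage (everything except load_bert_state is illustrative):
# convert pretrained DPR weights and load them into a ParlAI query encoder.
new_state = load_bert_state(
    datapath="/path/to/data",
    state_dict=query_encoder.state_dict(),     # `query_encoder` is assumed
    pretrained_dpr_path="/path/to/dpr_model",  # illustrative path
    encoder_type="query",
)
query_encoder.load_state_dict(new_state)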