示例#1
0
    def __init__(self, opt):
        """
        Set up the dictionary on top of a Hugging Face BERT tokenizer.

        After the parent initializer runs, ``download`` is called on the datapath,
        the tokenizer is loaded from the vocab file under
        ``<datapath>/models/bert_models`` and the ``[CLS]`` / ``[SEP]`` / ``[PAD]``
        tokens are registered in ``tok2ind`` and ``ind2tok``.

        :param opt:
            options; ``opt["datapath"]`` is read to locate the vocab file.
        """
        super().__init__(opt)
        warn_once(
            "WARNING: BERT uses a Hugging Face tokenizer; ParlAI dictionary args are ignored"
        )

        # Resolve the vocab file to a local path and build the tokenizer from it.
        download(opt["datapath"])
        bert_dir = os.path.join(opt["datapath"], "models", "bert_models")
        vocab_file = PathManager.get_local_path(os.path.join(bert_dir, VOCAB_PATH))
        self.tokenizer = BertTokenizer.from_pretrained(vocab_file)

        # BERT special tokens.
        self.start_token = "[CLS]"
        self.end_token = "[SEP]"
        self.null_token = "[PAD]"

        # Look up their ids through the tokenizer.
        to_ids = self.tokenizer.convert_tokens_to_ids
        self.start_idx = to_ids(["[CLS]"])[0]  # should be 101
        self.end_idx = to_ids(["[SEP]"])[0]  # should be 102
        self.pad_idx = to_ids(["[PAD]"])[0]  # should be 0

        special_tokens = (
            (self.start_token, self.start_idx),
            (self.end_token, self.end_idx),
            (self.null_token, self.pad_idx),
        )
        # token -> index for the special tokens
        for token, index in special_tokens:
            self.tok2ind[token] = index
        # index -> token for the special tokens
        for token, index in special_tokens:
            self.ind2tok[index] = token
示例#2
0
 def __init__(self, opt, shared=None):
     """
     Point the agent at the pretrained BERT model before the parent initializer runs.

     :param opt:
         options; ``opt["datapath"]`` is read, ``opt["pretrained_path"]`` is
         overwritten with the resolved local model path, and the optional
         ``add_cls_token`` / ``sep_last_utt`` flags are copied onto the instance.
     :param shared:
         forwarded unchanged to the parent initializer.
     """
     # Call download on the datapath, then resolve the model to a local path.
     download(opt["datapath"])
     bert_dir = os.path.join(opt["datapath"], "models", "bert_models")
     local_model = PathManager.get_local_path(os.path.join(bert_dir, MODEL_PATH))
     self.pretrained_path = local_model
     opt["pretrained_path"] = local_model

     # Flags default to True / False respectively when absent from opt.
     self.add_cls_token = opt.get("add_cls_token", True)
     self.sep_last_utt = opt.get("sep_last_utt", False)

     super().__init__(opt, shared)
示例#3
0
    def __init__(
        self,
        opt: Opt,
        dpr_model: str = 'bert',
        pretrained_path: str = DPR_ZOO_MODEL,
        encoder_type: str = 'query',
    ):
        """
        Build an encoder sized from the BERT config, then load DPR weights into it.

        :param opt:
            options; only ``opt['datapath']`` is read here.
        :param dpr_model:
            DPR model type, forwarded to ``_load_state``. Must be
            ``'bert_from_parlai_rag'`` if ``pretrained_path`` does not exist on
            disk; it is then reset to ``'bert'``.
        :param pretrained_path:
            path to the pretrained weights, resolved through ``modelzoo_path``.
        :param encoder_type:
            forwarded to ``_load_state``.
        """
        # Get the BERT config by name; on OSError use the copy under the datapath.
        try:
            config: BertConfig = BertConfig.from_pretrained('bert-base-uncased')
        except OSError:
            local_config = PathManager.get_local_path(
                os.path.join(opt['datapath'], "bert_base_uncased", self.CONFIG_PATH)
            )
            config = BertConfig.from_pretrained(local_config)

        datapath = opt['datapath']
        pretrained_path = modelzoo_path(datapath, pretrained_path)  # type: ignore
        if not os.path.exists(pretrained_path):
            # When initializing from ParlAI RAG models the pretrained path may
            # no longer exist. That is fine if the model has already been
            # trained, so fall back to the zoo model.
            assert dpr_model == 'bert_from_parlai_rag'
            logging.error(f'Pretrained Path does not exist: {pretrained_path}')
            pretrained_path = modelzoo_path(datapath, DPR_ZOO_MODEL)  # type: ignore
            dpr_model = 'bert'
            logging.error(f'Setting to zoo model: {pretrained_path}')

        # Encoder options taken from the BERT configuration.
        enc_opt = dict(
            n_heads=config.num_attention_heads,
            n_layers=config.num_hidden_layers,
            embedding_size=config.hidden_size,
            ffn_size=config.intermediate_size,
            dropout=config.hidden_dropout_prob,
            attention_dropout=config.attention_probs_dropout_prob,
            activation=config.hidden_act,
            variant='xlm',
            reduction_type='first',
            n_positions=config.max_position_embeddings,
            n_segments=config.type_vocab_size,
        )
        vocab_size = config.vocab_size
        pad_idx = config.pad_token_id
        embedding = torch.nn.Embedding(
            vocab_size, config.hidden_size, padding_idx=pad_idx
        )
        super().__init__(
            Opt(enc_opt),
            vocabulary_size=vocab_size,
            padding_idx=pad_idx,
            embedding=embedding,
            reduction_type='first',
        )

        self._load_state(datapath, dpr_model, pretrained_path, encoder_type)
示例#4
0
    def _init_from_pretrained(self, opt):
        """
        Load the GPT-2 model selected by ``opt["gpt2_size"]``.

        If ``<datapath>/hf/<model_key>`` contains both ``pytorch_model.bin`` and
        ``config.json``, the model is loaded from that directory; otherwise the
        model key itself is handed to ``GPT2Model.from_pretrained``.

        :param opt:
            options; ``opt["gpt2_size"]`` and ``opt["datapath"]`` are read.

        :return:
            the result of ``GPT2Model.from_pretrained``.
        """
        size = opt["gpt2_size"]
        # "small" is plain "gpt2", "distilgpt2" keeps its name, the rest get a suffix.
        model_key = (
            "gpt2"
            if size == "small"
            else "distilgpt2"
            if size == "distilgpt2"
            else f"gpt2-{size}"
        )

        # Use the datapath copy only when every file Hugging Face looks for is there.
        hf_dir = os.path.join(opt["datapath"], "hf", model_key)
        for file_name in ("pytorch_model.bin", "config.json"):
            if not PathManager.exists(os.path.join(hf_dir, file_name)):
                return GPT2Model.from_pretrained(model_key)

        local_dir = PathManager.get_local_path(hf_dir, recursive=True)
        return GPT2Model.from_pretrained(local_dir)
示例#5
0
    def _init_tokenizer(
        self, dictionary: DictionaryAgent
    ) -> Union[BertTokenizer, DictionaryAgent]:
        """
        Choose the tokenizer that matches the query model.

        For the BERT-based query models a BERT tokenizer is built, first by name
        and, if that raises ``ImportError`` or ``OSError``, from the vocab file
        under the datapath. Any other query model uses the given dictionary.

        :param dictionary:
            ParlAI dictionary agent

        :return:
            a BERT tokenizer, or ``dictionary`` itself.
        """
        if self.query_model not in ('bert', 'bert_from_parlai_rag'):
            return dictionary

        try:
            return BertTokenizer.from_pretrained('bert-base-uncased')
        except (ImportError, OSError):
            # Fall back to the vocab file stored under the datapath.
            vocab_file = os.path.join(
                self.datapath, "bert_base_uncased", self.VOCAB_PATH
            )
            local_vocab = PathManager.get_local_path(vocab_file)
            return transformers.BertTokenizer.from_pretrained(local_vocab)
示例#6
0
    def get_tokenizer(self, opt):
        """
        Instantiate the GPT-2 tokenizer selected by ``opt["gpt2_size"]``.

        If ``<datapath>/hf/<model_key>`` contains both ``merges.txt`` and
        ``vocab.json``, the tokenizer is loaded from that directory; otherwise the
        model key itself is handed to ``GPT2Tokenizer.from_pretrained``.

        :param opt:
            options; ``opt["gpt2_size"]`` and ``opt["datapath"]`` are read.

        :return:
            the result of ``GPT2Tokenizer.from_pretrained``.
        """
        size = opt["gpt2_size"]
        # "small" is plain "gpt2", "distilgpt2" keeps its name, the rest get a suffix.
        model_key = (
            "gpt2"
            if size == "small"
            else "distilgpt2"
            if size == "distilgpt2"
            else f"gpt2-{size}"
        )

        # Use the datapath copy only when every file Hugging Face looks for is there.
        hf_dir = os.path.join(opt["datapath"], "hf", model_key)
        for file_name in ("merges.txt", "vocab.json"):
            if not PathManager.exists(os.path.join(hf_dir, file_name)):
                return GPT2Tokenizer.from_pretrained(model_key)

        local_dir = PathManager.get_local_path(hf_dir, recursive=True)
        return GPT2Tokenizer.from_pretrained(local_dir)
示例#7
0
    def load_bert_state(
        datapath: str,
        state_dict: Dict[str, torch.Tensor],
        pretrained_dpr_path: str,
        encoder_type: str = 'query',
    ) -> Dict[str, torch.Tensor]:
        """
        Load BERT state from a HF model and convert it to a ParlAI state dict.

        :param datapath:
            directory searched for a ``bert_base_uncased`` copy when loading
            the model by name raises ``OSError``.
        :param state_dict:
            ParlAI model state_dict; every key in it must be present in the
            converted result.
        :param pretrained_dpr_path:
            path to pretrained DPR model; DPR loading is skipped if it is falsy.
        :param encoder_type:
            whether we're loading a document or query encoder.

        :return new_state_dict:
            return a state_dict with loaded weights.
        """
        try:
            hf_bert = BertModel.from_pretrained('bert-base-uncased')
        except OSError:
            local_dir = PathManager.get_local_path(
                os.path.join(datapath, "bert_base_uncased")
            )
            hf_bert = BertModel.from_pretrained(local_dir)

        if pretrained_dpr_path:
            BertConversionUtils.load_dpr_model(
                hf_bert, pretrained_dpr_path, encoder_type
            )

        # Drop the compatibility keys before converting the key names.
        hf_weights = hf_bert.state_dict()
        for compat_key in BERT_COMPATIBILITY_KEYS:
            hf_weights.pop(compat_key, None)
        converted = BertConversionUtils.convert_bert_to_parlai(hf_weights)

        # Every weight the ParlAI model expects must have a converted counterpart.
        missing = [k for k in state_dict if k not in converted]
        assert not missing, f"not all weights are being loaded: {missing}"
        return converted