def initialize(self, resources: Resources, configs: Config):
    """Prepare the processor: store resources/config, pick a device,
    and construct the BERT tokenizer and encoder from ``configs``.

    Args:
        resources: Shared pipeline resources (stored as-is).
        configs: Config exposing ``tokenizer``, ``model`` (and, when no
            pretrained model name is given, ``model.path``).
    """
    self.resource = resources
    self.config = configs
    # Run on GPU whenever torch can see one.
    self.device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")

    # The tokenizer is only built when the config names one.
    # NOTE(review): if "name" is absent, self.tokenizer stays unset.
    if "name" in self.config.tokenizer:
        self.tokenizer = BERTTokenizer(
            pretrained_model_name=self.config.tokenizer.name)

    if "name" in self.config.model:
        # Pretrained weights resolved by model name.
        self.encoder = BERTEncoder(
            pretrained_model_name=self.config.model.name)
    else:
        # No pretrained name: build a bare encoder, then restore weights
        # from a pickled checkpoint on disk.
        # NOTE(review): pickle.load is unsafe on untrusted files.
        self.encoder = BERTEncoder(
            pretrained_model_name=None,
            hparams={"pretrained_model_name": None},
        )
        with open(self.config.model.path, "rb") as checkpoint:
            state_dict = pickle.load(checkpoint)
        self.encoder.load_state_dict(state_dict["bert"])

    self.encoder.to(self.device)
def initialize(self, resources: Resources, configs: HParams):
    """Store resources and build a BERT tokenizer and encoder.

    The tokenizer vocabulary is read from ``configs.vocab_file``; the
    encoder always loads the ``bert-base-uncased`` pretrained weights
    and is moved to the selected device.
    """
    self.resource = resources
    # Vocabulary-file-backed tokenizer.
    self.tokenizer = BERTTokenizer.load(configs.vocab_file)
    # Prefer GPU when available.
    cuda_ok = torch.cuda.is_available()
    self.device = torch.device("cuda" if cuda_ok else "cpu")
    self.encoder = BERTEncoder(pretrained_model_name="bert-base-uncased")
    self.encoder.to(self.device)
class BertBasedQueryCreator(QueryProcessor):
    r"""This processor searches relevant documents for a query"""

    # pylint: disable=useless-super-delegation
    def __init__(self) -> None:
        super().__init__()

    def initialize(self, resources: Resources, configs: Config):
        """Store config/resources, choose a device, and build the BERT
        tokenizer and encoder described by ``configs``."""
        self.resource = resources
        self.config = configs
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        # Tokenizer is only created when the config names one.
        if "name" in self.config.tokenizer:
            self.tokenizer = BERTTokenizer(
                pretrained_model_name=self.config.tokenizer.name)

        if "name" in self.config.model:
            self.encoder = BERTEncoder(
                pretrained_model_name=self.config.model.name)
        else:
            # No pretrained name given: restore weights from a pickled
            # checkpoint at ``model.path``.
            # NOTE(review): pickle.load is unsafe on untrusted files.
            self.encoder = BERTEncoder(
                pretrained_model_name=None,
                hparams={"pretrained_model_name": None})
            with open(self.config.model.path, "rb") as checkpoint:
                state_dict = pickle.load(checkpoint)
            self.encoder.load_state_dict(state_dict["bert"])

        self.encoder.to(self.device)

    @classmethod
    def default_configs(cls) -> Dict[str, Any]:
        """Defaults: BERT model/tokenizer names, a sequence-length cap,
        and the name of the pack holding the query text."""
        config = super().default_configs()
        config.update({
            "model": {
                'path': None,
                "name": "bert-base-uncased",
            },
            "tokenizer": {
                "name": "bert-base-uncased"
            },
            "max_seq_length": 128,
            "query_pack_name": "query"
        })
        return config

    @torch.no_grad()
    def get_embeddings(self, inputs, sequence_length, segment_ids):
        """Run the encoder and return the [CLS] (position 0) embedding."""
        output, _ = self.encoder(
            inputs=inputs,
            sequence_length=sequence_length,
            segment_ids=segment_ids)
        return output[:, 0, :]

    def _build_query(self, text: str) -> np.ndarray:
        """Encode ``text`` into a single query-embedding vector."""
        input_ids, segment_ids, input_mask = self.tokenizer.encode_text(
            text_a=text, max_seq_length=self.config.max_seq_length)
        # Promote each id list to a batch-of-one tensor on the device.
        input_ids, segment_ids, input_mask = (
            torch.LongTensor(ids).unsqueeze(0).to(self.device)
            for ids in (input_ids, segment_ids, input_mask))
        # Count of non-padding positions per example.
        sequence_length = (~(input_mask == 0)).sum(dim=1)
        query_vector = self.get_embeddings(
            inputs=input_ids,
            sequence_length=sequence_length,
            segment_ids=segment_ids)
        # Mean over the (singleton) batch dimension, kept as shape (1, dim).
        query_vector = torch.mean(query_vector, dim=0, keepdim=True)
        return query_vector.cpu().numpy()

    def _process_query(self, input_pack: MultiPack) \
            -> Tuple[DataPack, Dict[str, Any]]:
        """Build a query vector from the query pack's text plus any
        user/bot utterance packs present in ``input_pack``."""
        query_pack: DataPack = input_pack.get_pack(
            self.config.query_pack_name)
        context = [query_pack.text]
        # Fold prior conversation turns into the query context.
        for pack_name in ("user_utterance", "bot_utterance"):
            if pack_name in input_pack.pack_names:
                context.append(input_pack.get_pack(pack_name).text)
        query_vector = self._build_query(text=' '.join(context))
        return query_pack, query_vector
# --- Script fragment: tokenize IMDB reviews with BERT and serialize them ---

# Upper bound on tokenized sequence length.
max_seq_length = 512

print(f"Processing the IMDB reviews...")
# Load the IMDB train/dev splits via the registered processor class.
processor_class = get_processor_class("IMDB")
imdb_processor = processor_class("data/IMDB")
train_examples = imdb_processor.get_train_examples()
dev_examples = imdb_processor.get_dev_examples()
# Flatten both splits into one list of raw review texts.
reviews = [
    example.text_a
    for dataset in [train_examples, dev_examples]
    for example in dataset
]

# create a BERT tokenizer
vocab_file = "data/pretrained_models/uncased_L-12_H-768_A-12/vocab.txt"
tokenizer = BERTTokenizer.load(vocab_file)

# BERT encoder
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder = BERTEncoder(pretrained_model_name="bert-base-uncased")
encoder.to(device)

print(f"Encoding the text using BERT Tokenizer...")
# Record schema: feature name -> [dtype, feature kind, optional fixed length].
feature_original_types = {
    "id": ["int64", "FixedLenFeature"],
    "input_ids": ["int64", "FixedLenFeature", max_seq_length],
    "segment_ids": ["int64", "FixedLenFeature", max_seq_length],
    "text": ["str", "FixedLenFeature"]
}

# NOTE(review): the body of this `with` block continues beyond this chunk
# and is not visible here.
with RecordData.writer("data/imdb.pkl", feature_original_types) as writer: