Example #1
    def initialize(self, resources: Resources, configs: Config):
        self.resource = resources
        self.config = configs

        # Run on GPU when available.
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        # Build the tokenizer from a pretrained checkpoint name, if given.
        if "name" in self.config.tokenizer:
            self.tokenizer = BERTTokenizer(
                pretrained_model_name=self.config.tokenizer.name)

        if "name" in self.config.model:
            self.encoder = BERTEncoder(
                pretrained_model_name=self.config.model.name)

        else:
            self.encoder = BERTEncoder(
                pretrained_model_name=None,
                hparams={"pretrained_model_name": None},
            )
            with open(self.config.model.path, "rb") as f:
                state_dict = pickle.load(f)
            self.encoder.load_state_dict(state_dict["bert"])

        self.encoder.to(self.device)
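
The `initialize` above reads only a handful of keys from its config. A minimal sketch of the expected structure (key names are taken from the code above and from the `default_configs` in Example #3; whether `Config` is built from a plain dict depends on the Forte version):

config_dict = {
    "tokenizer": {"name": "bert-base-uncased"},
    "model": {
        # Either a pretrained checkpoint name...
        "name": "bert-base-uncased",
        # ...or, with "name" absent, a path to a pickled state dict.
        "path": None,
    },
}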
Example #2
    def initialize(self, resources: Resources, configs: HParams):
        self.resource = resources
        # Build the tokenizer from an on-disk vocabulary file.
        vocab_file = configs.vocab_file
        self.tokenizer = BERTTokenizer.load(vocab_file)

        # Run on GPU when available; use the pretrained uncased BERT encoder.
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.encoder = BERTEncoder(pretrained_model_name="bert-base-uncased")
        self.encoder.to(self.device)
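
Once initialized, the tokenizer/encoder pair can embed text directly. A minimal sketch, reusing only the API calls that appear in the other examples here (module-level `tokenizer`, `encoder`, and `device` as set up in Example #4):

import torch

# Encode one sentence; encode_text pads/truncates to max_seq_length.
input_ids, segment_ids, input_mask = tokenizer.encode_text(
    text_a="a sample query", max_seq_length=128)
input_ids = torch.LongTensor(input_ids).unsqueeze(0).to(device)
segment_ids = torch.LongTensor(segment_ids).unsqueeze(0).to(device)
# Number of non-padding tokens in the single example.
sequence_length = torch.LongTensor([sum(input_mask)]).to(device)

with torch.no_grad():
    output, _ = encoder(inputs=input_ids,
                        sequence_length=sequence_length,
                        segment_ids=segment_ids)
cls_vector = output[:, 0, :]  # (1, hidden_dim) embedding of the [CLS] token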
Example #3
# Imports needed by this snippet (module paths as in Forte and texar-pytorch;
# adjust to the installed versions).
import pickle
from typing import Any, Dict, Tuple

import numpy as np
import torch
from texar.torch.data import BERTTokenizer
from texar.torch.modules import BERTEncoder

from forte.common.configuration import Config
from forte.common.resources import Resources
from forte.data.data_pack import DataPack
from forte.data.multi_pack import MultiPack
from forte.processors.base import QueryProcessor


class BertBasedQueryCreator(QueryProcessor):
    r"""A processor that builds a BERT-based query vector used to retrieve
    relevant documents for a query."""

    # pylint: disable=useless-super-delegation
    def __init__(self) -> None:
        super().__init__()

    def initialize(self, resources: Resources, configs: Config):
        self.resource = resources
        self.config = configs

        self.device = torch.device("cuda" if torch.cuda.is_available()
                                   else "cpu")

        if "name" in self.config.tokenizer:
            self.tokenizer = \
                BERTTokenizer(pretrained_model_name=self.config.tokenizer.name)

        if "name" in self.config.model:
            self.encoder = BERTEncoder(
                pretrained_model_name=self.config.model.name)

        else:
            self.encoder = BERTEncoder(pretrained_model_name=None,
                                       hparams={"pretrained_model_name": None})
            with open(self.config.model.path, "rb") as f:
                state_dict = pickle.load(f)
            self.encoder.load_state_dict(state_dict["bert"])

        self.encoder.to(self.device)

    @classmethod
    def default_configs(cls) -> Dict[str, Any]:
        config = super().default_configs()
        config.update({
            "model": {
                "path": None,
                "name": "bert-base-uncased",
            },
            "tokenizer": {
                "name": "bert-base-uncased"
            },
            "max_seq_length": 128,
            "query_pack_name": "query"
        })
        return config

    @torch.no_grad()
    def get_embeddings(self, inputs, sequence_length, segment_ids):
        output, _ = self.encoder(inputs=inputs,
                                 sequence_length=sequence_length,
                                 segment_ids=segment_ids)
        cls_token = output[:, 0, :]  # embedding of the [CLS] token

        return cls_token

    def _build_query(self, text: str) -> np.ndarray:
        input_ids, segment_ids, input_mask = \
            self.tokenizer.encode_text(
                text_a=text, max_seq_length=self.config.max_seq_length)
        input_ids = torch.LongTensor(input_ids).unsqueeze(0).to(self.device)
        segment_ids = torch.LongTensor(segment_ids).unsqueeze(0).to(self.device)
        input_mask = torch.LongTensor(input_mask).unsqueeze(0).to(self.device)
        sequence_length = (input_mask != 0).sum(dim=1)  # non-padding count
        query_vector = self.get_embeddings(inputs=input_ids,
                                           sequence_length=sequence_length,
                                           segment_ids=segment_ids)
        query_vector = torch.mean(query_vector, dim=0, keepdim=True)
        query_vector = query_vector.cpu().numpy()
        return query_vector

    def _process_query(self, input_pack: MultiPack) \
            -> Tuple[DataPack, Dict[str, Any]]:
        query_pack: DataPack = input_pack.get_pack(self.config.query_pack_name)
        context = [query_pack.text]

        # use context to build the query
        if "user_utterance" in input_pack.pack_names:
            user_pack = input_pack.get_pack("user_utterance")
            context.append(user_pack.text)

        if "bot_utterance" in input_pack.pack_names:
            bot_pack = input_pack.get_pack("bot_utterance")
            context.append(bot_pack.text)

        text = ' '.join(context)

        query_vector = self._build_query(text=text)

        return query_pack, query_vector
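
Downstream of `_process_query`, the returned vector is typically matched against precomputed document embeddings. A small illustration of that step (not part of the class above; it assumes the document vectors are stacked row-wise in a NumPy matrix):

import numpy as np

def rank_documents(query_vector, doc_matrix):
    # query_vector: (1, hidden_dim); doc_matrix: (num_docs, hidden_dim)
    q = query_vector / np.linalg.norm(query_vector, axis=1, keepdims=True)
    d = doc_matrix / np.linalg.norm(doc_matrix, axis=1, keepdims=True)
    scores = (d @ q.T).ravel()  # cosine similarity per document
    return np.argsort(-scores)  # document indices, best match first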
Example #4
# Imports for this snippet (texar-pytorch module paths; `get_processor_class`
# comes from the example's own data utilities and is assumed to be in scope).
import torch
from texar.torch.data import BERTTokenizer, RecordData
from texar.torch.modules import BERTEncoder

max_seq_length = 512

print(f"Processing the IMDB reviews...")
processor_class = get_processor_class("IMDB")
imdb_processor = processor_class("data/IMDB")
train_examples = imdb_processor.get_train_examples()
dev_examples = imdb_processor.get_dev_examples()
reviews = [
    example.text_a for dataset in [train_examples, dev_examples]
    for example in dataset
]

# create a BERT tokenizer
vocab_file = "data/pretrained_models/uncased_L-12_H-768_A-12/vocab.txt"
tokenizer = BERTTokenizer.load(vocab_file)

# BERT encoder
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder = BERTEncoder(pretrained_model_name="bert-base-uncased")
encoder.to(device)

print(f"Encoding the text using BERT Tokenizer...")
feature_original_types = {
    "id": ["int64", "FixedLenFeature"],
    "input_ids": ["int64", "FixedLenFeature", max_seq_length],
    "segment_ids": ["int64", "FixedLenFeature", max_seq_length],
    "text": ["str", "FixedLenFeature"]
}

with RecordData.writer("data/imdb.pkl", feature_original_types) as writer:
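    # (Sketch) The original snippet is truncated here. Assuming the usual
    # RecordData pattern, the loop below encodes each review and writes one
    # record per review matching `feature_original_types`.
    for idx, review in enumerate(reviews):
        input_ids, segment_ids, _input_mask = tokenizer.encode_text(
            text_a=review, max_seq_length=max_seq_length)
        writer.write({
            "id": idx,
            "input_ids": input_ids,
            "segment_ids": segment_ids,
            "text": review,
        })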