Example #1
    def _bert_encode_article(self,
                             max_seq_length=128,
                             sequence_a_segment_id=0,
                             sequence_b_segment_id=1,
                             cls_token_segment_id=0,  # [CLS] belongs to segment A for BERT
                             pad_token_segment_id=0,
                             mask_padding_with_zero=True):
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                  do_lower_case=True)
        bert_config = BertConfig.from_pretrained('bert-base-uncased')
        # Load the pretrained weights; BertModel(bert_config) alone would
        # start from randomly initialized parameters.
        model = BertModel.from_pretrained('bert-base-uncased',
                                          config=bert_config)

        all_input_ids, all_input_mask, all_segment_ids = [], [], []
        for header, article in zip(self.df_url['header'],
                                   self.df_url['article']):
            text = header + '. ' + article
            tokens = tokenizer.tokenize(text)
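            # Reserve room for the [CLS] and [SEP] special tokens added below.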
            special_tokens_count = 2
            if len(tokens) > max_seq_length - special_tokens_count:
                tokens = tokens[:(max_seq_length - special_tokens_count)]
            segment_ids = [sequence_a_segment_id] * len(tokens)
            tokens = [tokenizer.cls_token] + tokens + [tokenizer.sep_token]
            segment_ids = ([cls_token_segment_id] + segment_ids +
                           [sequence_a_segment_id])
            input_ids = tokenizer.convert_tokens_to_ids(tokens)
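            # Attention mask: 1 marks real tokens, 0 will mark the padding positions.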
            input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

            # Padding
            padding_length = max_seq_length - len(input_ids)
            pad_token = tokenizer.pad_token_id
            input_ids = input_ids + ([pad_token] * padding_length)
            input_mask = input_mask + [0] * padding_length
            segment_ids = segment_ids + ([pad_token_segment_id] *
                                         padding_length)

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length
            all_input_ids.append(input_ids)
            all_input_mask.append(input_mask)
            all_segment_ids.append(segment_ids)

        all_input_ids = torch.tensor(all_input_ids)
        all_input_mask = torch.tensor(all_input_mask)
        all_segment_ids = torch.tensor(all_segment_ids)

        model.eval()
        with torch.no_grad():
            outputs = model(all_input_ids,
                            attention_mask=all_input_mask,
                            token_type_ids=all_segment_ids)
        # outputs[1] is the pooled [CLS] representation, one vector per article.
        embedding = outputs[1].numpy()
        del model
        return embedding
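A minimal usage sketch for the method above, assuming it is bound to a class (called `ArticleEncoder` here, a hypothetical name) whose `self.df_url` is a pandas DataFrame with `header` and `article` columns, as the loop implies:

import pandas as pd

df_url = pd.DataFrame({
    'header': ['Example headline'],
    'article': ['Example body text for the encoder.'],
})
encoder = ArticleEncoder(df_url)  # hypothetical class hosting _bert_encode_article
embeddings = encoder._bert_encode_article(max_seq_length=64)
print(embeddings.shape)  # expected: (len(df_url), hidden_size)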
Example #2
class TestBertModel(unittest.TestCase):
    def init_data(self, use_cuda) -> None:
        torch.set_grad_enabled(False)
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)
        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')

        self.cfg = BertConfig()
        self.torch_model = BertModel(self.cfg)
        self.torch_model.eval()

        if torch.cuda.is_available():
            self.torch_model.to(self.test_device)

        self.turbo_model = turbo_transformers.BertModel.from_torch(
            self.torch_model, self.test_device)

    def check_torch_and_turbo(self, use_cuda):
        self.init_data(use_cuda)
        num_iter = 1
        device_name = "GPU" if use_cuda else "CPU"
        input_ids = torch.randint(low=0,
                                  high=self.cfg.vocab_size - 1,
                                  size=(1, 10),
                                  dtype=torch.long,
                                  device=self.test_device)

        torch_model = lambda: self.torch_model(input_ids)
        torch_result, torch_qps, torch_time = \
            test_helper.run_model(torch_model, use_cuda, num_iter)
        print(f'BertModel PyTorch({device_name}) QPS {torch_qps}')

        turbo_model = (lambda: self.turbo_model(input_ids))

        with turbo_transformers.pref_guard("bert_perf") as perf:
            turbo_result, turbo_qps, turbo_time = \
                test_helper.run_model(turbo_model, use_cuda, num_iter)
        print(f'BertModel TurboTransformer({device_name}) QPS {turbo_qps}')

        self.assertTrue(
            numpy.allclose(torch_result[0][:, 0].cpu(),
                           turbo_result[0].cpu(),
                           atol=1e-3,
                           rtol=1e-3))

    def test_bert_model(self):
        if torch.cuda.is_available() and \
            turbo_transformers.config.is_compiled_with_cuda():
            self.check_torch_and_turbo(use_cuda=True)
        self.check_torch_and_turbo(use_cuda=False)
class BertEmbed:
    def __init__(self):
        config = BertConfig.from_json_file(join(BERT_PATH, 'bert_config.json'))
        self.tokenizer = BertTokenizer(vocab_file=join(BERT_PATH, 'vocab.txt'))
        self.model = BertModel(config, add_pooling_layer=False)
        load_tf_weights_in_bert(self.model,
                                tf_checkpoint_path=join(
                                    BERT_PATH, 'bert_model.ckpt'),
                                strip_bert=True)
        self.model.to(PT_DEVICE)
        self.model.eval()

    def get_embedding(self, sentences):
        x = self.tokenizer(sentences, return_tensors='pt',
                           padding=True).to(PT_DEVICE)
        with torch.no_grad():
            output = self.model(**x)[0]
        return output.cpu().numpy()
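A short usage sketch for `BertEmbed`, assuming `BERT_PATH` points at a TensorFlow BERT checkpoint directory (with `bert_config.json`, `vocab.txt`, and `bert_model.ckpt`) and `PT_DEVICE` is a `torch.device`, both defined elsewhere in the module:

embedder = BertEmbed()
vectors = embedder.get_embedding(['first sentence', 'second sentence'])
print(vectors.shape)  # last hidden states: (batch_size, max_seq_len, hidden_size)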
class TestBertModel(unittest.TestCase):
    def init_data(self, use_cuda) -> None:
        torch.set_grad_enabled(False)
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)
        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')

        self.cfg = BertConfig()
        self.torch_model = BertModel(self.cfg)
        self.torch_model.eval()

        if torch.cuda.is_available():
            self.torch_model.to(self.test_device)

        self.turbo_model = turbo_transformers.BertModel.from_torch(
            self.torch_model, self.test_device, "turbo")

    def check_torch_and_turbo(self,
                              use_cuda,
                              batch_size,
                              seq_len,
                              use_memory_opt=True):
        self.init_data(use_cuda)
        num_iter = 1
        device_name = "GPU" if use_cuda else "CPU"
        input_ids = torch.randint(low=0,
                                  high=self.cfg.vocab_size - 1,
                                  size=(batch_size, seq_len),
                                  dtype=torch.long,
                                  device=self.test_device)

        torch_model = lambda: self.torch_model(input_ids)
        torch_result, torch_qps, torch_time = \
            test_helper.run_model(torch_model, use_cuda, num_iter)
        print(f'BertModel PyTorch({device_name}) QPS {torch_qps}')

        turbo_model = (lambda: self.turbo_model(input_ids))

        if use_memory_opt:
            turbo_transformers.bert_opt_mem_allocate_api(
                input_ids.size()[0],  # batch
                input_ids.size()[1],  # seq_len
                self.cfg.num_attention_heads,
                self.cfg.hidden_size,
                self.cfg.num_hidden_layers,
                "GPU" if 'cuda' in input_ids.device.type else "CPU")

        with turbo_transformers.pref_guard("bert_perf") as perf:
            turbo_result, turbo_qps, turbo_time = \
                test_helper.run_model(turbo_model, use_cuda, num_iter)
        print(f'BertModel TurboTransformer({device_name}) QPS {turbo_qps}')

        print(f"batch {batch_size} seq_len {seq_len}")
        print(torch.max(torch_result[0].cpu() - turbo_result[0].cpu()))
        self.assertTrue(
            numpy.allclose(torch_result[0].cpu(),
                           turbo_result[0].cpu(),
                           atol=1e-2,
                           rtol=1e-3))

    def bert_model_test_helper(self, use_memory_opt=False):
        if use_memory_opt:
            turbo_transformers.reset_allocator_schema("model-aware")

        for batch_size in [2, 4, 1]:
            for seq_len in [50, 4, 16]:
                if torch.cuda.is_available() and \
                        turbo_transformers.config.is_compiled_with_cuda():
                    self.check_torch_and_turbo(use_cuda=True,
                                               batch_size=batch_size,
                                               seq_len=seq_len,
                                               use_memory_opt=use_memory_opt)
                self.check_torch_and_turbo(use_cuda=False,
                                           batch_size=batch_size,
                                           seq_len=seq_len,
                                           use_memory_opt=use_memory_opt)

        if use_memory_opt:
            turbo_transformers.reset_allocator_schema("naive")

    def test_bert_model(self):
        # self.bert_model_test_helper(True)
        self.bert_model_test_helper(False)
class TestBertModel(unittest.TestCase):
    def init_data(self, use_cuda) -> None:
        torch.set_grad_enabled(False)
        torch.set_num_threads(1)
        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')

        self.cfg = BertConfig()
        self.torch_model = BertModel(self.cfg)
        self.torch_model.eval()

        if torch.cuda.is_available():
            self.torch_model.to(self.test_device)

        self.turbo_model = turbo_transformers.BertModel.from_torch(
            self.torch_model, self.test_device)

        self.turbo_pooler_model = turbo_transformers.BertModelWithPooler.from_torch(
            self.torch_model, self.test_device)

    def check_torch_and_turbo(self, use_cuda, use_pooler):
        self.init_data(use_cuda)
        num_iter = 2
        device_name = "GPU" if use_cuda else "CPU"
        input_ids = torch.randint(low=0,
                                  high=self.cfg.vocab_size - 1,
                                  size=(2, 32),
                                  dtype=torch.long,
                                  device=self.test_device)

        torch_model = lambda: self.torch_model(input_ids)
        torch_result, torch_qps, torch_time = \
            test_helper.run_model(torch_model, use_cuda, num_iter)
        print(f'BertModel Plain PyTorch({device_name}) QPS {torch_qps}')

        turbo_model = (
            lambda: self.turbo_pooler_model(input_ids)) if use_pooler else (
                lambda: self.turbo_model(input_ids))
        turbo_result, turbo_qps, turbo_time = \
            test_helper.run_model(turbo_model, use_cuda, num_iter)
        print(f'BertModel TurboTransformer({device_name}) QPS {turbo_qps}')

        torch_result_final = (torch_result[1].cpu().numpy() if use_pooler
                              else torch_result[0][:, 0].cpu().numpy())

        turbo_result_final = turbo_result[0].cpu().numpy()

        # TODO(jiaruifang, v_cshi): check why the pooler introduces a larger difference
        if use_pooler:
            print(
                "encode output diff: ",
                numpy.max((torch_result[0][:, 0]).cpu().numpy() -
                          turbo_result[1].cpu().numpy()).reshape(-1))
            print(
                "pooler output diff: ",
                numpy.max(
                    (turbo_result_final - torch_result_final).reshape(-1)))
        (atol, rtol) = (1e-2, 1e-2) if use_pooler else (5e-3, 1e-4)

        self.assertTrue(
            numpy.allclose(torch_result_final,
                           turbo_result_final,
                           atol=atol,
                           rtol=rtol))

    def test_bert_model(self):
        if torch.cuda.is_available() and \
            turbo_transformers.config.is_compiled_with_cuda():
            self.check_torch_and_turbo(use_cuda=True, use_pooler=False)
            self.check_torch_and_turbo(use_cuda=True, use_pooler=True)
        self.check_torch_and_turbo(use_cuda=False, use_pooler=False)
        self.check_torch_and_turbo(use_cuda=False, use_pooler=True)
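The `TestBertModel` variants above share the same imports and a project-local `test_helper.run_model` benchmarking utility; a minimal sketch of the module header and entry point they assume (the exact `test_helper` module is project-specific and only assumed here):

import unittest

import numpy
import torch
import turbo_transformers
from transformers import BertConfig, BertModel

import test_helper  # assumed local helper: run_model(model_fn, use_cuda, num_iter)

if __name__ == '__main__':
    unittest.main()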