import torch
from transformers import BertModel, BertTokenizer


def _bert_encode_article(self,
                         max_seq_length=128,
                         sequence_a_segment_id=0,
                         sequence_b_segment_id=1,
                         cls_token_segment_id=1,
                         pad_token_segment_id=0,
                         mask_padding_with_zero=True):
    # sequence_b_segment_id is unused here: inputs are single sequences.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True)
    # Load pretrained weights; constructing BertModel(BertConfig(...))
    # directly would leave the encoder randomly initialized.
    model = BertModel.from_pretrained('bert-base-uncased')

    all_input_ids, all_input_mask, all_segment_ids = [], [], []
    for header, article in zip(self.df_url['header'], self.df_url['article']):
        text = header + '. ' + article
        tokens = tokenizer.tokenize(text)

        # Truncate to leave room for the [CLS] and [SEP] special tokens.
        special_tokens_count = 2
        if len(tokens) > max_seq_length - special_tokens_count:
            tokens = tokens[:(max_seq_length - special_tokens_count)]

        segment_ids = [sequence_a_segment_id] * len(tokens)
        tokens = [tokenizer.cls_token] + tokens + [tokenizer.sep_token]
        segment_ids = [cls_token_segment_id] + segment_ids + \
            [sequence_a_segment_id]

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        # Real tokens are marked 1, padding 0 (when mask_padding_with_zero).
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Pad every sequence up to max_seq_length.
        padding_length = max_seq_length - len(input_ids)
        pad_token = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]
        input_ids = input_ids + ([pad_token] * padding_length)
        input_mask = input_mask + \
            ([0 if mask_padding_with_zero else 1] * padding_length)
        segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        all_input_ids.append(input_ids)
        all_input_mask.append(input_mask)
        all_segment_ids.append(segment_ids)

    all_input_ids = torch.tensor(all_input_ids)
    all_input_mask = torch.tensor(all_input_mask)
    all_segment_ids = torch.tensor(all_segment_ids)

    model.eval()
    with torch.no_grad():
        outputs = model(all_input_ids,
                        attention_mask=all_input_mask,
                        token_type_ids=all_segment_ids)
    # outputs[1] is the pooled [CLS] representation, one vector per article.
    embedding = outputs[1].cpu().numpy()
    del model
    return embedding
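
# A minimal usage sketch, not part of the original code: _bert_encode_article
# is clearly a method lifted out of some class holding a df_url DataFrame, so
# a hypothetical ArticleEncoder class is assumed here purely to show how it
# could be wired up and called.
import pandas as pd


class ArticleEncoder:
    # Attach the module-level function as a method (hypothetical host class).
    _bert_encode_article = _bert_encode_article

    def __init__(self, df_url):
        self.df_url = df_url


articles = pd.DataFrame({
    'header': ['BERT explained'],
    'article': ['BERT is a bidirectional transformer encoder.'],
})
embeddings = ArticleEncoder(articles)._bert_encode_article()
print(embeddings.shape)  # (num_articles, hidden_size), e.g. (1, 768)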
import unittest

import numpy
import torch
import turbo_transformers
from transformers import BertConfig, BertModel

import test_helper


class TestBertModel(unittest.TestCase):
    def init_data(self, use_cuda) -> None:
        torch.set_grad_enabled(False)
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)

        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')

        self.cfg = BertConfig()
        self.torch_model = BertModel(self.cfg)
        self.torch_model.eval()
        if torch.cuda.is_available():
            self.torch_model.to(self.test_device)

        self.turbo_model = turbo_transformers.BertModel.from_torch(
            self.torch_model, self.test_device)

    def check_torch_and_turbo(self, use_cuda):
        self.init_data(use_cuda)
        num_iter = 1
        device_name = "GPU" if use_cuda else "CPU"
        input_ids = torch.randint(low=0,
                                  high=self.cfg.vocab_size - 1,
                                  size=(1, 10),
                                  dtype=torch.long,
                                  device=self.test_device)

        torch_model = lambda: self.torch_model(input_ids)
        torch_result, torch_qps, torch_time = \
            test_helper.run_model(torch_model, use_cuda, num_iter)
        print(f'BertModel PyTorch({device_name}) QPS {torch_qps}')

        turbo_model = lambda: self.turbo_model(input_ids)
        with turbo_transformers.pref_guard("bert_perf") as perf:
            turbo_result, turbo_qps, turbo_time = \
                test_helper.run_model(turbo_model, use_cuda, num_iter)
        print(f'BertModel TurboTransformer({device_name}) QPS {turbo_qps}')

        # Compare the [CLS] hidden state of the PyTorch model against the
        # Turbo output.
        self.assertTrue(
            numpy.allclose(torch_result[0][:, 0].cpu(),
                           turbo_result[0].cpu(),
                           atol=1e-3,
                           rtol=1e-3))

    def test_bert_model(self):
        if torch.cuda.is_available() and \
                turbo_transformers.config.is_compiled_with_cuda():
            self.check_torch_and_turbo(use_cuda=True)
        self.check_torch_and_turbo(use_cuda=False)
from os.path import join

import torch
from transformers import BertConfig, BertModel, BertTokenizer

# BERT_PATH and PT_DEVICE are module-level constants defined elsewhere in
# this project. load_tf_weights_in_bert is assumed to be the project's own
# TF-checkpoint loader: the strip_bert keyword does not exist on the stock
# transformers function of the same name.


class BertEmbed:
    def __init__(self):
        config = BertConfig.from_json_file(join(BERT_PATH, 'bert_config.json'))
        self.tokenizer = BertTokenizer(vocab_file=join(BERT_PATH, 'vocab.txt'))
        # Skip the pooling layer: get_embedding only needs token-level
        # hidden states.
        self.model = BertModel(config, add_pooling_layer=False)
        load_tf_weights_in_bert(self.model,
                                tf_checkpoint_path=join(BERT_PATH,
                                                        'bert_model.ckpt'),
                                strip_bert=True)
        self.model.to(PT_DEVICE)
        self.model.eval()

    def get_embedding(self, sentences):
        x = self.tokenizer(sentences, return_tensors='pt',
                           padding=True).to(PT_DEVICE)
        with torch.no_grad():
            output = self.model(**x)[0]  # last_hidden_state
        return output.cpu().numpy()
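
# A brief usage sketch under the assumptions above (BERT_PATH holding
# bert_config.json, vocab.txt and bert_model.ckpt; PT_DEVICE set to e.g.
# torch.device('cpu')). It mean-pools the token-level states that
# get_embedding returns into one vector per sentence.
embedder = BertEmbed()
token_states = embedder.get_embedding(['hello world',
                                       'a longer example sentence'])
# token_states has shape (batch, max_tokens, hidden_size); padded positions
# are included in the mean, so a mask-aware average would be more precise.
sentence_vectors = token_states.mean(axis=1)
print(sentence_vectors.shape)  # (2, hidden_size)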
import unittest

import numpy
import torch
import turbo_transformers
from transformers import BertConfig, BertModel

import test_helper


class TestBertModel(unittest.TestCase):
    def init_data(self, use_cuda) -> None:
        torch.set_grad_enabled(False)
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)

        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')

        self.cfg = BertConfig()
        self.torch_model = BertModel(self.cfg)
        self.torch_model.eval()
        if torch.cuda.is_available():
            self.torch_model.to(self.test_device)

        self.turbo_model = turbo_transformers.BertModel.from_torch(
            self.torch_model, self.test_device, "turbo")

    def check_torch_and_turbo(self,
                              use_cuda,
                              batch_size,
                              seq_len,
                              use_memory_opt=True):
        self.init_data(use_cuda)
        num_iter = 1
        device_name = "GPU" if use_cuda else "CPU"
        input_ids = torch.randint(low=0,
                                  high=self.cfg.vocab_size - 1,
                                  size=(batch_size, seq_len),
                                  dtype=torch.long,
                                  device=self.test_device)

        torch_model = lambda: self.torch_model(input_ids)
        torch_result, torch_qps, torch_time = \
            test_helper.run_model(torch_model, use_cuda, num_iter)
        print(f'BertModel PyTorch({device_name}) QPS {torch_qps}')

        turbo_model = lambda: self.turbo_model(input_ids)

        if use_memory_opt:
            # Pre-plan activation memory for this (batch, seq_len) shape.
            turbo_transformers.bert_opt_mem_allocate_api(
                input_ids.size()[0],  # batch
                input_ids.size()[1],  # seq_len
                self.cfg.num_attention_heads,
                self.cfg.hidden_size,
                self.cfg.num_hidden_layers,
                "GPU" if 'cuda' in input_ids.device.type else "CPU")

        with turbo_transformers.pref_guard("bert_perf") as perf:
            turbo_result, turbo_qps, turbo_time = \
                test_helper.run_model(turbo_model, use_cuda, num_iter)
        print(f'BertModel TurboTransformer({device_name}) QPS {turbo_qps}')

        print(f"batch {batch_size} seq_len {seq_len}")
        print(torch.max(torch_result[0].cpu() - turbo_result[0].cpu()))
        self.assertTrue(
            numpy.allclose(torch_result[0].cpu(),
                           turbo_result[0].cpu(),
                           atol=1e-2,
                           rtol=1e-3))

    def bert_model_test_helper(self, use_memory_opt=False):
        if use_memory_opt:
            turbo_transformers.reset_allocator_schema("model-aware")

        for batch_size in [2, 4, 1]:
            for seq_len in [50, 4, 16]:
                if torch.cuda.is_available() and \
                        turbo_transformers.config.is_compiled_with_cuda():
                    self.check_torch_and_turbo(use_cuda=True,
                                               batch_size=batch_size,
                                               seq_len=seq_len,
                                               use_memory_opt=use_memory_opt)
                self.check_torch_and_turbo(use_cuda=False,
                                           batch_size=batch_size,
                                           seq_len=seq_len,
                                           use_memory_opt=use_memory_opt)

        if use_memory_opt:
            turbo_transformers.reset_allocator_schema("naive")

    def test_bert_model(self):
        # self.bert_model_test_helper(True)
        self.bert_model_test_helper(False)
import unittest

import numpy
import torch
import turbo_transformers
from transformers import BertConfig, BertModel

import test_helper


class TestBertModel(unittest.TestCase):
    def init_data(self, use_cuda) -> None:
        torch.set_grad_enabled(False)
        torch.set_num_threads(1)

        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')

        self.cfg = BertConfig()
        self.torch_model = BertModel(self.cfg)
        self.torch_model.eval()
        if torch.cuda.is_available():
            self.torch_model.to(self.test_device)

        self.turbo_model = turbo_transformers.BertModel.from_torch(
            self.torch_model, self.test_device)
        self.turbo_pooler_model = turbo_transformers.BertModelWithPooler.from_torch(
            self.torch_model, self.test_device)

    def check_torch_and_turbo(self, use_cuda, use_pooler):
        self.init_data(use_cuda)
        num_iter = 2
        device_name = "GPU" if use_cuda else "CPU"
        input_ids = torch.randint(low=0,
                                  high=self.cfg.vocab_size - 1,
                                  size=(2, 32),
                                  dtype=torch.long,
                                  device=self.test_device)

        torch_model = lambda: self.torch_model(input_ids)
        torch_result, torch_qps, torch_time = \
            test_helper.run_model(torch_model, use_cuda, num_iter)
        print(f'BertModel Plain PyTorch({device_name}) QPS {torch_qps}')

        turbo_model = (
            lambda: self.turbo_pooler_model(input_ids)) if use_pooler else (
                lambda: self.turbo_model(input_ids))
        turbo_result, turbo_qps, turbo_time = \
            test_helper.run_model(turbo_model, use_cuda, num_iter)
        print(f'BertModel TurboTransformer({device_name}) QPS {turbo_qps}')

        # With the pooler, compare pooled outputs; otherwise compare the
        # [CLS] hidden state.
        torch_result_final = (torch_result[1]).cpu().numpy(
        ) if use_pooler else torch_result[0][:, 0].cpu().numpy()
        turbo_result_final = turbo_result[0].cpu().numpy()

        # TODO(jiaruifang, v_cshi) check why the pooler introduces a larger
        # difference.
        if use_pooler:
            print(
                "encode output diff: ",
                numpy.max((torch_result[0][:, 0]).cpu().numpy() -
                          turbo_result[1].cpu().numpy()).reshape(-1))
            print(
                "pooler output diff: ",
                numpy.max(
                    (turbo_result_final - torch_result_final).reshape(-1)))

        (atol, rtol) = (1e-2, 1e-2) if use_pooler else (5e-3, 1e-4)
        self.assertTrue(
            numpy.allclose(torch_result_final,
                           turbo_result_final,
                           atol=atol,
                           rtol=rtol))

    def test_bert_model(self):
        if torch.cuda.is_available() and \
                turbo_transformers.config.is_compiled_with_cuda():
            self.check_torch_and_turbo(use_cuda=True, use_pooler=False)
            self.check_torch_and_turbo(use_cuda=True, use_pooler=True)
        self.check_torch_and_turbo(use_cuda=False, use_pooler=False)
        self.check_torch_and_turbo(use_cuda=False, use_pooler=True)
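
# These TestBertModel suites follow the standard unittest pattern; the usual
# entry point below (an assumption, matching how such test files are
# typically run) lets each snippet execute directly as a script.
if __name__ == '__main__':
    unittest.main()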