Example No. 1
    def __init__(self, config):
        super().__init__()
        self.attention = BertAttention(config)
        self.is_decoder = config.is_decoder
        if self.is_decoder:
            self.crossattention = BertAttention(config)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)
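A minimal usage sketch of the sub-modules wired up above; the import path and the forward chaining are assumptions following the usual HuggingFace layout and are not part of the example itself:

    import torch
    from transformers import BertConfig
    from transformers.models.bert.modeling_bert import (BertAttention,
                                                         BertIntermediate,
                                                         BertOutput)

    config = BertConfig(hidden_size=64, num_attention_heads=4,
                        intermediate_size=128)
    attention = BertAttention(config)
    intermediate = BertIntermediate(config)
    output = BertOutput(config)

    hidden_states = torch.rand(2, 8, config.hidden_size)  # (batch, seq, hidden)
    attn_out = attention(hidden_states)[0]                 # self-attention + residual/LayerNorm
    layer_out = output(intermediate(attn_out), attn_out)   # feed-forward + residual/LayerNorm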
Example No. 2
    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = BertAttention(config)
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention:
            assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added"
            self.crossattention = BertAttention(config)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)
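The two extra attributes, chunk_size_feed_forward and seq_len_dim, are normally consumed by HuggingFace's chunked feed-forward helper. A hedged sketch of how that usually looks inside forward(); the helper import assumes a recent transformers layout and the forward itself is not part of the example:

    from transformers.pytorch_utils import apply_chunking_to_forward

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        return self.output(intermediate_output, attention_output)

    # inside forward(), after self-attention:
    # layer_output = apply_chunking_to_forward(self.feed_forward_chunk,
    #                                          self.chunk_size_feed_forward,
    #                                          self.seq_len_dim, attention_output)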
Example No. 3
    def __init__(self, config):
        super().__init__()
        # The cross-attention Layer
        self.visual_attention = BertCrossattLayer(config)

        # Self-attention Layers
        self.lang_self_att = BertAttention(config)
        self.visn_self_att = BertAttention(config)

        # Intermediate and Output Layers (FFNs)
        self.lang_inter = BertIntermediate(config)
        self.lang_output = BertOutput(config)
        self.visn_inter = BertIntermediate(config)
        self.visn_output = BertOutput(config)
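A hedged sketch of the usual LXMERT-style wiring of these sub-layers: the shared cross-attention layer lets each modality attend over the other before the per-modality self-attention and FFN blocks. The method below and the BertCrossattLayer call signature are assumptions, not part of the example:

    def cross_att(self, lang_input, lang_attention_mask,
                  visn_input, visn_attention_mask):
        # each stream queries the other stream through the shared cross-attention layer
        lang_att_output = self.visual_attention(lang_input, visn_input,
                                                ctx_att_mask=visn_attention_mask)
        visn_att_output = self.visual_attention(visn_input, lang_input,
                                                ctx_att_mask=lang_attention_mask)
        return lang_att_output, visn_att_output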
Example No. 4
    def from_torch(attention: TorchBertAttention):
        params = {k: v for k, v in attention.named_parameters()}
        with torch.no_grad():
            # merge self.query.weight, self.key.weight and self.value.weight together as qkv.weight
            qkv_weight = torch.clone(
                torch.t(
                    torch.cat((params['self.query.weight'],
                               params['self.key.weight'],
                               params['self.value.weight']),
                              0).contiguous()).contiguous())
            qkv_bias = torch.cat(
                (params['self.query.bias'], params['self.key.bias'],
                 params['self.value.bias']), 0).contiguous()

            output_weight = torch.clone(
                torch.t(params['output.dense.weight']).contiguous())
            att = BertAttention(
                convert2tt_tensor(qkv_weight), convert2tt_tensor(qkv_bias),
                convert2tt_tensor(output_weight),
                convert2tt_tensor(params['output.dense.bias']),
                convert2tt_tensor(params['output.LayerNorm.weight']),
                convert2tt_tensor(params['output.LayerNorm.bias']),
                attention.self.num_attention_heads)

            return att
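The cat/transpose/clone above fuses the three projection matrices into one (in_features, 3 * out_features) buffer; a small standalone check of the resulting shape (H is a hypothetical hidden size):

    import torch

    H = 4                                  # hypothetical hidden size
    q = torch.rand(H, H)                   # nn.Linear stores weights as (out_features, in_features)
    k = torch.rand(H, H)
    v = torch.rand(H, H)
    qkv = torch.clone(torch.t(torch.cat((q, k, v), 0).contiguous()).contiguous())
    assert qkv.shape == (H, 3 * H)         # fused and transposed QKV weight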
Example No. 5
    def __init__(self, config):
        super(GeneratingMasksAC, self).__init__(config)
        if config.model_type == 'bert':
            self.bert = BertModel(config=config)
        else:
            self.bert = None

        # Reload the config (since this is BERT, there is probably a
        # simpler way to do this)
        if self.bert is not None:
            config = BertConfig.from_pretrained("bert-base-uncased")
        config.attention_probs_dropout_prob = 0.0
        config.hidden_dropout_prob = 0.0
        self.config = config
        self.transformer = BertAttention(config)

        self.policy1 = nn.Linear(config.hidden_size, 128)
        self.policy2 = nn.Linear(128, 1)

        # Value Part #
        self.value1 = nn.Linear(config.hidden_size, 128)
        self.value2 = nn.Linear(128, 1)

        #self.apply(self._init_weights)
        self.init_weights()
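The forward() of GeneratingMasksAC is not shown in this example, so the following wiring of the attention block and the policy/value heads is only a hedged guess at how they might be applied:

    def forward(self, hidden_states):
        # hidden_states: (batch, seq, hidden) token representations
        attn_out = self.transformer(hidden_states)[0]
        policy_logits = self.policy2(torch.relu(self.policy1(attn_out))).squeeze(-1)
        values = self.value2(torch.relu(self.value1(attn_out))).squeeze(-1)
        return policy_logits, values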
Example No. 6
        def init_data(self, use_cuda):
            test_device = torch.device('cuda:0') if use_cuda else \
                   torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(4)
                turbo_transformers.set_num_threads(4)

            torch.set_grad_enabled(False)
            self.cfg = BertConfig(attention_probs_dropout_prob=0.0,
                                  hidden_dropout_prob=0.0)
            self.cfg.output_attentions = True
            torch_attention = BertAttention(self.cfg)
            torch_attention.eval()
            if use_cuda:
                torch_attention.to(test_device)

            # Get FT Attention
            turbo_attention = turbo_transformers.BertAttention.from_torch(
                torch_attention)

            turbo_decoder_attention = turbo_transformers.MultiHeadedAttention.from_torch(
                torch_attention, is_trans_weight=False)

            hidden_size = self.cfg.hidden_size
            input_tensor = torch.rand(size=(batch_size, seq_length,
                                            hidden_size),
                                      dtype=torch.float32,
                                      device=test_device)
            attention_mask = torch.ones((batch_size, seq_length),
                                        dtype=torch.float32,
                                        device=test_device)
            attention_mask = attention_mask[:, None, None, :]
            attention_mask = (1.0 - attention_mask) * -10000.0
            return torch_attention, turbo_attention, turbo_decoder_attention, input_tensor, attention_mask
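A standalone illustration of the additive attention mask built at the end of init_data; the sizes are hypothetical:

    import torch

    mask = torch.ones(2, 5, dtype=torch.float32)  # 1 = real token, 0 = padding
    mask = mask[:, None, None, :]                 # (batch, 1, 1, seq), broadcastable over heads
    mask = (1.0 - mask) * -10000.0                # 0 where kept, -10000 added to padded logits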
Example No. 7
    def __init__(self, config):
        super(BertLayerOracleSparse, self).__init__()
        logger.info(
            f"Set Oracle Sparse with key_c:{config.key_c} and query_c:{config.query_c}!"
        )

        self.attention = BertAttention(config)
        self.attention.self.output_attentions = True
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

        self.key_c = config.key_c
        self.query_c = config.query_c
        self.num_heads = config.num_attention_heads
Example No. 8
        def init_attn_models(self, use_cuda: bool) -> None:
            self.test_device = torch.device('cuda:0') if use_cuda else \
                torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(1)

            torch.set_grad_enabled(False)
            self.cfg = BertConfig(attention_probs_dropout_prob=0.0,
                                  hidden_dropout_prob=0.0)

            # torch model is from ONMT
            # self.torch_model = MultiHeadedAttention(self.cfg.num_attention_heads, self.cfg.hidden_size)
            self.torch_model = BertAttention(self.cfg)
            self.torch_model.eval()
            if use_cuda:
                self.torch_model.to(self.test_device)

            self.hidden_size = self.cfg.hidden_size

            # self.turbo_model = turbo_transformers.MultiHeadedAttentionSmartBatch.from_onmt(
            #     self.torch_model)
            self.turbo_model = turbo_transformers.MultiHeadedAttentionSmartBatch.from_torch(
                self.torch_model)
Example No. 9
        def init_bert_models(self, use_cuda: bool) -> None:
            self.test_device = torch.device('cuda:0') if use_cuda else \
                torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(1)

            torch.set_grad_enabled(False)
            self.cfg = BertConfig(attention_probs_dropout_prob=0.0,
                                  hidden_dropout_prob=0.0)

            self.torch_model = BertModel(self.cfg)
            self.torch_model.eval()
            if use_cuda:
                self.torch_model.to(self.test_device)

            self.hidden_size = self.cfg.hidden_size

            self.turbo_model = turbo_transformers.BertModelSmartBatch.from_torch(
                self.torch_model)
Example No. 10
    class TestBertSmartBatch(unittest.TestCase):
        def init_bertlayer_models(self, use_cuda: bool) -> None:
            self.test_device = torch.device('cuda:0') if use_cuda else \
                torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(1)

            torch.set_grad_enabled(False)
            self.cfg = BertConfig(attention_probs_dropout_prob=0.0,
                                  hidden_dropout_prob=0.0)

            self.torch_model = BertLayer(self.cfg)
            self.torch_model.eval()
            if use_cuda:
                self.torch_model.to(self.test_device)

            self.hidden_size = self.cfg.hidden_size

            self.turbo_model = turbo_transformers.BertLayerSmartBatch.from_torch(
                self.torch_model)

        def init_bert_models(self, use_cuda: bool) -> None:
            self.test_device = torch.device('cuda:0') if use_cuda else \
                torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(1)

            torch.set_grad_enabled(False)
            self.cfg = BertConfig(attention_probs_dropout_prob=0.0,
                                  hidden_dropout_prob=0.0)

            self.torch_model = BertModel(self.cfg)
            self.torch_model.eval()
            if use_cuda:
                self.torch_model.to(self.test_device)

            self.hidden_size = self.cfg.hidden_size

            self.turbo_model = turbo_transformers.BertModelSmartBatch.from_torch(
                self.torch_model)

        def init_attn_models(self, use_cuda: bool) -> None:
            self.test_device = torch.device('cuda:0') if use_cuda else \
                torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(1)

            torch.set_grad_enabled(False)
            self.cfg = BertConfig(attention_probs_dropout_prob=0.0,
                                  hidden_dropout_prob=0.0)

            # torch model is from ONMT
            # self.torch_model = MultiHeadedAttention(self.cfg.num_attention_heads, self.cfg.hidden_size)
            self.torch_model = BertAttention(self.cfg)
            self.torch_model.eval()
            if use_cuda:
                self.torch_model.to(self.test_device)

            self.hidden_size = self.cfg.hidden_size

            # self.turbo_model = turbo_transformers.MultiHeadedAttentionSmartBatch.from_onmt(
            #     self.torch_model)
            self.turbo_model = turbo_transformers.MultiHeadedAttentionSmartBatch.from_torch(
                self.torch_model)

        def init_inputs(self):
            # prepare torch input data
            self.input_list = []
            for query_seq_len in query_seq_len_list:
                Q = torch.rand(
                    size=(
                        1,
                        query_seq_len,  #from_seq
                        self.hidden_size),
                    dtype=torch.float32,
                    device=self.test_device)
                self.input_list.append(Q)

            # concat Qs together
            for i in range(len(query_seq_len_list)):
                if i == 0:
                    self.concat_Q = self.input_list[i]
                else:
                    self.concat_Q = torch.cat(
                        (self.concat_Q, self.input_list[i]), 1)

            self.assertTrue(self.concat_Q.size()[1] == sum(query_seq_len_list))

        def init_inputs_seq(self):
            # prepare torch input data
            self.input_list = []
            for query_seq_len in query_seq_len_list:
                input_seq = torch.randint(low=0,
                                          high=self.cfg.vocab_size - 1,
                                          size=(1, query_seq_len),
                                          dtype=torch.long,
                                          device=self.test_device)
                self.input_list.append(input_seq)

            # self.assertTrue(self.concat_Q.size()[1] == sum(query_seq_len_list))

        def check_bert_attn(self, use_cuda):
            self.init_attn_models(use_cuda)
            self.init_inputs()

            num_iter = 2
            device = "GPU" if use_cuda else "CPU"

            res_list = []
            for Q in self.input_list:
                # res, _ = self.torch_model(
                #     Q,
                #     Q,
                #     Q,
                #     mask=None,
                #     layer_cache=None,  #layer_cache_torch
                #     attn_type="self")
                # res_list.append(res)
                attention_mask = torch.ones((1, Q.size(1)),
                                            dtype=torch.float32,
                                            device=self.test_device)
                attention_mask = attention_mask[:, None, None, :]
                attention_mask = (1.0 - attention_mask) * -10000.0
                res = self.torch_model(Q, attention_mask=None)
                res_list.append(res[0])

            # concat res_list together
            for i in range(len(res_list)):
                if i == 0:
                    concat_res = res_list[i]
                else:
                    concat_res = torch.cat((concat_res, res_list[i]), 1)

            pad_result, _ = self.turbo_model(self.concat_Q,
                                             self.concat_Q,
                                             self.concat_Q,
                                             query_seq_len_list, [],
                                             mask=None,
                                             layer_cache=None,
                                             post_layernorm=True,
                                             attn_type="self")

            # Tensor core will introduce more errors
            tolerate_error = 1e-2 if use_cuda else 1e-3
            self.assertTrue(
                torch.max(torch.abs(concat_res - pad_result)) < tolerate_error)

        def check_bert_layer(self, use_cuda):
            self.init_bertlayer_models(use_cuda)
            self.init_inputs()

            num_iter = 2
            device = "GPU" if use_cuda else "CPU"

            res_list = []
            for Q in self.input_list:
                res, _ = self.torch_model(Q, None, output_attentions=True)
                res_list.append(res)

            # concat res_list together
            for i in range(len(res_list)):
                if i == 0:
                    concat_res = res_list[i]
                else:
                    concat_res = torch.cat((concat_res, res_list[i]), 1)

            pad_result, _ = self.turbo_model(self.concat_Q,
                                             query_seq_len_list,
                                             attention_mask=None,
                                             output_attentions=False)

            # Tensor core will introduce more errors
            tolerate_error = 1e-2 if use_cuda else 1e-3
            self.assertTrue(
                torch.max(torch.abs(concat_res - pad_result)) < tolerate_error)

            # self.assertTrue(
            #     torch.max(
            #         torch.abs(torch_bert_layer_result[1] -
            #                   turbo_bert_layer_result[1])) < tolerate_error)

            # with open(fname, "a") as fh:
            #     fh.write(
            #         f"\"({batch_size},{seq_length:03})\", {torch_qps}, {turbo_qps}\n"
            #     )

        def check_bert_model(self, use_cuda):
            self.init_bert_models(use_cuda)
            self.init_inputs_seq()

            num_iter = 2
            device = "GPU" if use_cuda else "CPU"

            # for reference
            res_list = []
            for Q in self.input_list:
                res, _ = self.torch_model(Q)
                res_list.append(res)

            for i in range(len(res_list)):
                if i == 0:
                    concat_res = res_list[i]
                else:
                    concat_res = torch.cat((concat_res, res_list[i]), 1)

            # turbo inference
            pad_result, _ = self.turbo_model(self.input_list,
                                             query_seq_len_list)

            # Tensor core will introduce more errors
            tolerate_error = 1e-2 if use_cuda else 1e-3
            self.assertTrue(
                torch.max(torch.abs(concat_res - pad_result)) < tolerate_error)

            # self.assertTrue(
            #     torch.max(
            #         torch.abs(torch_bert_layer_result[1] -
            #                   turbo_bert_layer_result[1])) < tolerate_error)

            # with open(fname, "a") as fh:
            #     fh.write(
            #         f"\"({batch_size},{seq_length:03})\", {torch_qps}, {turbo_qps}\n"
            #     )

        def test_bert(self):
            self.check_bert_model(use_cuda=False)
            self.check_bert_layer(use_cuda=False)
            self.check_bert_attn(use_cuda=False)
            if torch.cuda.is_available() and \
                    turbo_transformers.config.is_compiled_with_cuda():
                self.check_bert_model(use_cuda=True)
                self.check_bert_layer(use_cuda=True)
                self.check_bert_attn(use_cuda=True)
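The test class relies on a module-level query_seq_len_list and the usual unittest entry point, neither of which appears in this example; a hedged sketch of those missing pieces (the concrete lengths are hypothetical):

    query_seq_len_list = [7, 13, 20]  # hypothetical per-sequence lengths in the smart batch

    if __name__ == '__main__':
        unittest.main()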
Example No. 11
    def __init__(self, config):
        super(BertScanLayer, self).__init__()
        self.attention = BertAttention(config)
        self.scan_attention = BertScanAttention(config)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)
Example No. 12
    def from_torch(attention: TorchBertAttention,
                   layer_norm: Optional[TorchLayerNorm] = None,
                   is_trans_weight: bool = False):
        """
        Load an attention model from a HuggingFace BERT attention model.
        """
        ln_params = {}
        if layer_norm is not None:
            ln_params = {k: v for k, v in layer_norm.named_parameters()}
        params = {k: v for k, v in attention.named_parameters()}
        with torch.no_grad():
            if is_trans_weight:
                # merge self.query.weight, self.key.weight and self.value.weight together as qkv.weight
                qkv_weight = torch.cat(
                    (params['self.query.weight'], params['self.key.weight'],
                     params['self.value.weight']), 0)
                output_weight = params['output.dense.weight']
                k_w = params['self.key.weight']
                v_w = params['self.value.weight']
                q_w = params['self.query.weight']
            else:
                # merge self.query.weight, self.key.weight and self.value.weight together as qkv.weight
                qkv_weight = torch.clone(
                    torch.t(
                        torch.cat((params['self.query.weight'],
                                   params['self.key.weight'],
                                   params['self.value.weight']),
                                  0).contiguous()).contiguous())
                output_weight = torch.clone(
                    torch.t(params['output.dense.weight']).contiguous())
                k_w = torch.clone(
                    torch.t(params['self.key.weight']).contiguous())
                v_w = torch.clone(
                    torch.t(params['self.value.weight']).contiguous())
                q_w = torch.clone(
                    torch.t(params['self.query.weight']).contiguous())

            qkv_bias = torch.cat(
                (params['self.query.bias'], params['self.key.bias'],
                 params['self.value.bias']), 0)

            if layer_norm is not None:
                att = MultiHeadedAttention(
                    convert2tt_tensor(k_w),
                    convert2tt_tensor(params['self.key.bias']),
                    convert2tt_tensor(v_w),
                    convert2tt_tensor(params['self.value.bias']),
                    convert2tt_tensor(q_w),
                    convert2tt_tensor(params['self.query.bias']),
                    convert2tt_tensor(output_weight),
                    convert2tt_tensor(params['output.dense.bias']),
                    convert2tt_tensor(qkv_weight), convert2tt_tensor(qkv_bias),
                    convert2tt_tensor(params['output.LayerNorm.weight']),
                    convert2tt_tensor(params['output.LayerNorm.bias']),
                    convert2tt_tensor(ln_params['weight']),
                    convert2tt_tensor(ln_params['bias']),
                    attention.self.num_attention_heads)
            else:
                att = MultiHeadedAttention(
                    convert2tt_tensor(k_w),
                    convert2tt_tensor(params['self.key.bias']),
                    convert2tt_tensor(v_w),
                    convert2tt_tensor(params['self.value.bias']),
                    convert2tt_tensor(q_w),
                    convert2tt_tensor(params['self.query.bias']),
                    convert2tt_tensor(output_weight),
                    convert2tt_tensor(params['output.dense.bias']),
                    convert2tt_tensor(qkv_weight), convert2tt_tensor(qkv_bias),
                    convert2tt_tensor(params['output.LayerNorm.weight']),
                    convert2tt_tensor(params['output.LayerNorm.bias']),
                    attention.self.num_attention_heads)
            return att
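A hedged usage sketch, assuming this from_torch is exposed as turbo_transformers.MultiHeadedAttention.from_torch (as in the test example above) and that a HuggingFace BertAttention instance is available:

    import turbo_transformers
    from transformers import BertConfig
    from transformers.models.bert.modeling_bert import BertAttention as TorchBertAttention

    cfg = BertConfig(attention_probs_dropout_prob=0.0, hidden_dropout_prob=0.0)
    torch_attention = TorchBertAttention(cfg).eval()
    turbo_attention = turbo_transformers.MultiHeadedAttention.from_torch(
        torch_attention, is_trans_weight=False)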