Example #1
    def check_torch_and_turbo(self, use_cuda):
        self.init_data(use_cuda)
        num_iter = 1
        device_name = "GPU" if use_cuda else "CPU"
        input_ids = torch.randint(low=0,
                                  high=self.cfg.vocab_size - 1,
                                  size=(1, 10),
                                  dtype=torch.long,
                                  device=self.test_device)

        torch_model = lambda: self.torch_model(input_ids)
        torch_result, torch_qps, torch_time = \
            test_helper.run_model(torch_model, use_cuda, num_iter)
        print(f'GPT2Model PyTorch({device_name}) QPS {torch_qps}')

        turbo_model = (lambda: self.turbo_model(input_ids))

        turbo_result, turbo_qps, turbo_time = \
            test_helper.run_model(turbo_model, use_cuda, num_iter)
        print(f'GPT2Model TurboTransformer({device_name}) QPS {turbo_qps}')

        self.assertTrue(
            numpy.allclose(torch_result[0].cpu(),
                           turbo_result[0].cpu(),
                           atol=1e-3,
                           rtol=1e-3))
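Every example in this listing calls test_helper.run_model(model_fn, use_cuda, num_iter) and unpacks a (result, qps, elapsed_time) tuple. The helper itself is not part of the listing; the snippet below is only a minimal sketch of such a timing wrapper, assuming that three-value contract and the optional use_profile flag seen in later examples (the real test_helper may differ).

import time

import torch


def run_model(model_fn, use_cuda, num_iter, use_profile=False):
    # Hypothetical stand-in for test_helper.run_model: call model_fn()
    # num_iter times and report (last result, QPS, total elapsed seconds).
    # use_profile is accepted for signature compatibility but ignored here.
    if use_cuda:
        torch.cuda.synchronize()
    start = time.time()
    for _ in range(num_iter):
        result = model_fn()
    if use_cuda:
        torch.cuda.synchronize()
    elapsed = time.time() - start
    return result, num_iter / elapsed, elapsed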
Example #2
        def check_torch_and_turbo(self, use_cuda):
            self.init_data(use_cuda=use_cuda)
            device = "GPU" if use_cuda else "CPU"
            num_iter = 1
            turbo_model = lambda: self.turbo_model(
                self.input_tensor, attention_mask=None, head_mask=None)
            turbo_result, turbo_qps, turbo_time = \
                test_helper.run_model(turbo_model, use_cuda, num_iter)

            print(
                f"AlbertModel \"({batch_size},{seq_length:03})\" ",
                f"{device} TurboTransform QPS,  {turbo_qps}, time, {turbo_time}"
            )
            torch_model = lambda: self.torch_model(input_ids=self.input_tensor,
                                                   attention_mask=None,
                                                   head_mask=None)
            with turbo_transformers.pref_guard("albert_perf") as perf:
                torch_result, torch_qps, torch_time = \
                    test_helper.run_model(torch_model, use_cuda, num_iter)

            print(f"AlbertModel \"({batch_size},{seq_length:03})\" ",
                  f"{device} Torch QPS,  {torch_qps}, time, {torch_time}")

            # print(turbo_result[-1])
            # print(turbo_result, torch_result[0])
            # TODO(jiaruifang) Error is too high. Does tensor core introduce more differences?
            tolerate_error = 1e-2
            self.assertTrue(
                torch.max(torch.abs(torch_result[0] -
                                    turbo_result[0])) < tolerate_error)

            with open("albert_model_res.txt", "a") as fh:
                fh.write(
                    f"\"({batch_size},{seq_length:03})\", {torch_qps}, {turbo_qps}\n"
                )
Example #3
    def check_torch_and_turbo(self, use_cuda):
        self.init_data(use_cuda)
        num_iter = 20
        device_name = "GPU" if use_cuda else "CPU"
        input_ids = torch.randint(low=0,
                                  high=self.cfg.vocab_size - 1,
                                  size=(1, 10),
                                  dtype=torch.long,
                                  device=self.test_device)

        torch_model = lambda: self.torch_model(input_ids)
        torch_result, torch_qps, torch_time = \
            test_helper.run_model(torch_model, use_cuda, num_iter)
        print(f'RobertaModel PyTorch({device_name}) QPS {torch_qps}')

        turbo_model = (lambda: self.turbo_model(input_ids))
        with turbo_transformers.pref_guard("roberta_perf") as perf:
            turbo_result, turbo_qps, turbo_time = \
                test_helper.run_model(turbo_model, use_cuda, num_iter)
        print(f'RobertaModel TurboTransformer({device_name}) QPS {turbo_qps}')

        torch_result_final = torch_result[0].cpu().numpy()

        turbo_result_final = turbo_result[0].cpu().numpy()
        # print(numpy.size(torch_result_final), numpy.size(turbo_result_final))
        # print(torch_result_final - turbo_result_final)
        self.assertTrue(
            numpy.allclose(torch_result_final,
                           turbo_result_final,
                           atol=1e-3,
                           rtol=1e-3))
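The check_torch_and_turbo methods in these examples are helpers on unittest.TestCase subclasses, and the init_data methods they rely on are omitted from the listing. A plausible driver, with a hypothetical class name and guarding the GPU path on CUDA availability, would look like this:

import unittest

import torch


class TestRobertaModel(unittest.TestCase):
    # Hypothetical wrapper class: init_data and the check_torch_and_turbo
    # method shown above would be defined here.

    def test_roberta_model(self):
        # Always exercise the CPU path; only check GPU when CUDA is available.
        self.check_torch_and_turbo(use_cuda=False)
        if torch.cuda.is_available():
            self.check_torch_and_turbo(use_cuda=True)


if __name__ == "__main__":
    unittest.main()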
Example #4
        def check_torch_and_turbo(self, use_cuda, num_iter=1):
            self.init_data(use_cuda)
            device = "GPU" if use_cuda else "CPU"
            torch_model = lambda: self.torch_sa_layer_norm(
                self.torch_attention(query=self.input_tensor,
                                     key=self.input_tensor,
                                     value=self.input_tensor,
                                     mask=self.attention_mask,
                                     output_attentions=False)[0] + self.
                input_tensor)
            torch_attention_result, torch_qps, torch_time_consume = \
                test_helper.run_model(torch_model, use_cuda, num_iter, use_profile=False)
            print(
                f"DistilAttention+LN \"({batch_size},{seq_length:03})\" ",
                f"{device} Torch QPS, {torch_qps}, time, {torch_time_consume}")

            turbo_model = lambda: self.turbo_attention(
                self.input_tensor,
                self.attention_mask,
                output_attentions=self.cfg.output_attentions)[0]

            turbo_attention_result, turbo_qps, turbo_time_consume = \
                test_helper.run_model(turbo_model, use_cuda,
                                      num_iter)
            print(
                f"DistilAttention \"({batch_size},{seq_length:03})\" ",
                f" {device} Turbo QPS, {turbo_qps}, time, {turbo_time_consume}"
            )

            self.assertTrue(
                torch.max(
                    torch.abs(torch_attention_result - turbo_attention_result))
                < (1e-3 if use_cuda else 1e-4))
Example #5
        def check_torch_and_turbo(self, use_cuda):
            self.init_data(use_cuda)
            num_iter = 2
            device = "GPU" if use_cuda else "CPU"
            torch_model = lambda: self.torch_bert_layer(
                self.input_tensor, self.attention_mask)
            torch_bert_layer_result, torch_qps, torch_time = \
                test_helper.run_model(torch_model, use_cuda, num_iter)
            print(f"BertLayer \"({batch_size},{seq_length:03})\" ",
                  f"{device} Torch QPS,  {torch_qps}, time, {torch_time}")

            turbo_model = lambda: self.turbo_bert_layer(
                self.input_tensor, self.attention_mask)
            turbo_bert_layer_result, turbo_qps, turbo_time = \
                test_helper.run_model(turbo_model, use_cuda, num_iter)
            print(
                f"BertLayer \"({batch_size},{seq_length:03})\"  ",
                f"{device} TurboTransform QPS, {turbo_qps}, time, {turbo_time}"
            )

            # Tensor core will introduce more errors
            tolerate_error = 1e-2 if use_cuda else 1e-3
            self.assertTrue(
                torch.max(
                    torch.abs(torch_bert_layer_result[0] -
                              turbo_bert_layer_result)) < tolerate_error)
            with open(fname, "a") as fh:
                fh.write(
                    f"\"({batch_size},{seq_length:03})\", {torch_qps}, {turbo_qps}\n"
                )
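Several of these examples (this one included) read batch_size, seq_length, and fname as free variables rather than method arguments, which suggests each TestCase is generated inside a parameterizing closure. A minimal sketch of that pattern, using hypothetical names (create_test, TestBertLayer) and illustrative shape values:

import unittest

fname = "bert_layer_res.txt"  # shared result file, captured by the closure


def create_test(batch_size, seq_length):
    class TestBertLayer(unittest.TestCase):
        # init_data and the check_torch_and_turbo method shown above would
        # live here, closing over batch_size, seq_length and fname.

        def test_bert_layer(self):
            self.check_torch_and_turbo(use_cuda=False)

    # Register the generated class under a unique, discoverable name.
    globals()[f"TestBertLayer_{batch_size}_{seq_length:03}"] = TestBertLayer


for batch_size in [1, 2]:
    for seq_length in [16, 64, 128]:
        create_test(batch_size, seq_length)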
Example #6
        def check_torch_and_turbo(self, use_cuda):
            self.init_data(use_cuda)
            sio = io.StringIO()
            num_iter = 2
            device = "GPU" if use_cuda else "CPU"

            torch_model = lambda: self.torch_bertout(self.intermediate_output,
                                                     self.attention_output)
            torch_result, torch_qps, torch_time = \
                test_helper.run_model(torch_model, use_cuda, num_iter)
            print(f'Bert Output Plain PyTorch({device}) QPS {torch_qps}',
                  file=sio)

            turbo_model = lambda: self.turbo_bertout(self.intermediate_output,
                                                     self.attention_output)
            turbo_result, turbo_qps, turbo_time = \
                test_helper.run_model(turbo_model, use_cuda, num_iter)
            print(
                f'Bert Output Plain TurboTransformer({device}) QPS {turbo_qps}',
                file=sio)

            # cuda version precision is lower due to tensor-core
            self.assertTrue(
                torch.max(torch.abs(torch_result - turbo_result)) < (
                    1e-2 if use_cuda else 1e-4))

            sio.seek(0)
            with open(f"gpu_bert_output_qps_{batch_size}_{seq_length:03}.txt",
                      "w") as of:
                for line in sio:
                    print(line.strip(), file=of)
Example #7
    def check_torch_and_turbo(self, use_cuda=True):
        self.init_data(use_cuda=use_cuda)
        self.num_iter = 2

        turbo_bert_layer_result = None
        turbo_model = lambda: self.turbo_bert_encoder(
            self.input_tensor,
            self.attention_mask,
            output=turbo_bert_layer_result,
            return_type=turbo_transformers.ReturnType.turbo_transformers)

        turbo_bert_layer_result, turbo_qps, turbo_time_consume = \
            test_helper.run_model(turbo_model, use_cuda, self.num_iter)

        print(f"BertEncoder TurboTransform QPS, {turbo_qps}, ",
              f"Time Cost, {turbo_time_consume}")

        turbo_bert_layer_result = self.turbo_bert_encoder(
            self.input_tensor, self.attention_mask)

        torch_model = lambda: self.torch_encoder_layer(
            self.input_tensor, self.attention_mask, [None] * self.cfg.
            num_hidden_layers)

        torch_bert_layer_result, torch_qps, torch_time_consume = \
            test_helper.run_model(torch_model, use_cuda, self.num_iter)

        print(f"BertEncoder Torch QPS, {torch_qps}, ",
              f"Time Cost, {torch_time_consume}")

        diff = torch.abs(torch_bert_layer_result[0] - turbo_bert_layer_result)
        self.assertTrue(torch.max(diff) < 1e-3)
Example #8
        def check_torch_and_turbo(self, use_cuda, num_iter=2):
            torch_attention, turbo_attention, input_tensor, attention_mask = \
                self.init_data(use_cuda)
            device = "GPU" if use_cuda else "CPU"
            torch_model = lambda: torch_attention(input_tensor, attention_mask)
            torch_attention_result, torch_qps, torch_time_consume = \
                test_helper.run_model(torch_model, use_cuda, num_iter)
            print(
                f"AlbertAttention \"({batch_size},{seq_length:03})\" ",
                f"{device} Torch QPS, {torch_qps}, time, {torch_time_consume}")

            turbo_model = lambda: turbo_attention(input_tensor, attention_mask)
            turbo_self_attention_result, turbo_qps, turbo_time_consume = \
                test_helper.run_model(turbo_model, use_cuda,
                                      num_iter)
            print(
                f"AlbertAttention \"({batch_size},{seq_length:03})\" ",
                f" {device} Turbo QPS, {turbo_qps}, time, {turbo_time_consume}"
            )

            self.assertTrue(
                torch.max(
                    torch.abs(torch_attention_result[0] -
                              turbo_self_attention_result)) < (
                                  1e-3 if use_cuda else 1e-4))
            with open(fname, "a") as fh:
                fh.write(
                    f"\"({batch_size},{seq_length:03})\", {torch_qps}, {turbo_qps}\n"
                )
Example #9
        def check_torch_and_turbo(self, use_cuda, num_iter=1):
            self.init_data(use_cuda)
            device = "GPU" if use_cuda else "CPU"

            torch_model = lambda: self.output_layer_norm(
                self.torch_ffn(self.inputs) + self.inputs)
            torch_res, torch_qps, torch_time_consume = \
                test_helper.run_model(torch_model, use_cuda, num_iter)

            print(
                f"DistillFFN \"({batch_size}, {input_len:03})\" ",
                f"{device} Torch QPS, {torch_qps}, time, {torch_time_consume}")

            turbo_model = lambda: self.turbo_ffn(self.inputs,
                                                 is_trans_weight=True)
            with turbo_transformers.pref_guard("gpref_test") as perf:
                turbo_res, turbo_qps, turbo_time_consume = \
                    test_helper.run_model(turbo_model, use_cuda, num_iter)

            print(
                f"DistillFFN \"({batch_size}, {input_len:03})\" ",
                f"{device} Turbo Trans QPS, {turbo_qps}, time, {turbo_time_consume}"
            )

            print(torch.max(torch.abs(torch_res - turbo_res)))
            self.assertTrue(torch.max(torch.abs(torch_res - turbo_res)) < 1e-3)

            with open(fname, "a") as fh:
                fh.write(
                    f"\"({batch_size},{input_len:03})\", {torch_qps}, {turbo_qps}\n"
                )
Example #10
        def check_torch_and_turbo(self, use_cuda, num_iter=1):
            deivce_type = "GPU" if use_cuda else "CPU"
            info = f"\"({deivce_type}, {batch_size}, {src_length}, {T})\""

            self.init_data(use_cuda=use_cuda)

            self.inputs = torch.rand(batch_size,
                                     src_length,
                                     self.model_dim,
                                     dtype=torch.float32,
                                     device=self.test_device)

            self.mask = torch.randint(-100,
                                      0, (batch_size, 1, src_length),
                                      dtype=torch.int64,
                                      device=self.test_device)

            onmt_mask = self.mask > 0

            onmt_model = lambda: self.onmt_encoder(self.inputs, onmt_mask)

            onmt_result, torch_qps, torch_time_consume = \
                test_helper.run_model(onmt_model, use_cuda, num_iter)

            print(
                f"ONMT Encoder {info} ",
                f"{deivce_type} QPS, {torch_qps}, time, {torch_time_consume}")

            if with_quantize_dynamic and not use_cuda:
                quantized_onmt_model = lambda: self.quantized_onmt_encoder(
                    self.inputs, onmt_mask)

                quantized_onmt_result, quantized_torch_qps, quantized_torch_time_consume = \
                    test_helper.run_model(quantized_onmt_model, use_cuda, num_iter)

                print(
                    f"ONMT Quantized Encoder {info} ",
                    f"{deivce_type} QPS, {quantized_torch_qps}, time, {quantized_torch_time_consume}"
                )

            turbo_model = lambda: self.turbo_encoder(self.inputs, onmt_mask)

            with turbo_transformers.pref_guard(info) as perf:
                turbo_result, turbo_qps, turbo_time_consume = \
                    test_helper.run_model(turbo_model, use_cuda, num_iter)

            print(
                f"Turbo Encoder {info} ",
                f"{deivce_type} QPS, {turbo_qps}, time, {turbo_time_consume}")

            print(f"diff max {torch.max(torch.abs(onmt_result - turbo_result))}")
            self.assertTrue(
                torch.max(torch.abs(onmt_result - turbo_result)) < (1e-3 if use_cuda else 1e-4))

            if with_quantize_dynamic and not use_cuda:
                with open(fname, "a") as fh:
                    fh.write(
                        f"{info} {torch_qps}, {quantized_torch_qps}, {turbo_qps}\n"
                    )
            else:
                with open(fname, "a") as fh:
                    fh.write(f"{info} {torch_qps}, {turbo_qps}\n")
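When with_quantize_dynamic is set (CPU only), the example above also benchmarks self.quantized_onmt_encoder against the float encoder. That quantized copy is built in init_data, which is not shown; one way to obtain it, assuming self.onmt_encoder is an ordinary torch.nn.Module, is PyTorch dynamic quantization:

import torch
import torch.nn as nn


def make_quantized_copy(onmt_encoder: nn.Module) -> nn.Module:
    # Hypothetical helper: replace nn.Linear weights with int8 kernels
    # (CPU only), leaving the rest of the encoder in float32.
    return torch.quantization.quantize_dynamic(
        onmt_encoder, {nn.Linear}, dtype=torch.qint8)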
Example #11
        def check_torch_and_turbo(self, use_cuda, num_iter=1):
            self.init_data(use_cuda)
            device = "GPU" if use_cuda else "CPU"

            torch_model = lambda: self.torch_model(self.inputs, self.
                                                   attention_mask)
            torch_res, torch_qps, torch_time_consume = \
                test_helper.run_model(torch_model, use_cuda, num_iter)

            print(
                f"DistillBertModel \"({batch_size}, {input_len:03})\" ",
                f"{device} Torch QPS, {torch_qps}, time, {torch_time_consume}")

            turbo_model = lambda: self.turbo_transformer(
                self.inputs, self.attention_mask, head_mask=self.head_mask)
            with turbo_transformers.pref_guard("gpref_test") as perf:
                turbo_res, turbo_qps, turbo_time_consume = \
                    test_helper.run_model(turbo_model, use_cuda, num_iter)

            print(
                f"DistillBertModel \"({batch_size}, {input_len:03})\" ",
                f"{device} Turbo QPS, {turbo_qps}, time, {turbo_time_consume}")

            self.assertTrue(
                torch.max(torch.abs(torch_res[0] - turbo_res[0])) < (
                    1e-2 if use_cuda else 1e-3))

            with open(fname, "a") as fh:
                fh.write(
                    f"\"({batch_size},{input_len:03})\", {torch_qps}, {turbo_qps}\n"
                )
Example #12
        def check_torch_and_turbo(self, use_cuda, num_iter=1):
            torch_attention, turbo_attention, turbo_decoder_attention, input_tensor, attention_mask = \
                self.init_data(use_cuda)
            device = "GPU" if use_cuda else "CPU"
            torch_model = lambda: torch_attention(input_tensor, attention_mask)
            torch_attention_result, torch_qps, torch_time_consume = \
                test_helper.run_model(torch_model, use_cuda, num_iter)
            print(
                f"BertAttention \"({batch_size},{seq_length:03})\" ",
                f"{device} Torch QPS, {torch_qps}, time, {torch_time_consume}")

            turbo_model = lambda: turbo_attention(input_tensor, attention_mask)
            turbo_attention_result, turbo_qps, turbo_time_consume = \
                test_helper.run_model(turbo_model, use_cuda,
                                      num_iter)
            print(
                f"BertAttention \"({batch_size},{seq_length:03})\" ",
                f" {device} Turbo QPS, {turbo_qps}, time, {turbo_time_consume}"
            )

            self.assertTrue(
                torch.max(
                    torch.abs(torch_attention_result[0] -
                              turbo_attention_result[0])) < (
                                  1e-3 if use_cuda else 1e-4))

            turbo_multiheaded_model = lambda: turbo_decoder_attention(
                input_tensor,
                input_tensor,
                input_tensor,
                attention_mask,
                layer_cache=None,
                attn_type="self",
                pre_layernorm=False,
                post_layernorm=True,
                post_add_input=False,
                is_trans_weight=False)
            turbo_decoder_attn_result, turbo_decoder_qps, turbo_decoder_time_consume = \
                test_helper.run_model(turbo_multiheaded_model, use_cuda,
                                      num_iter, use_profile=False)
            print(
                f"MultiHeadedAttention \"({batch_size},{seq_length:03})\" ",
                f" {device} Turbo QPS, {turbo_decoder_qps}, time, {turbo_decoder_time_consume}"
            )
            self.assertTrue(
                torch.max(
                    torch.abs(torch_attention_result[0] -
                              turbo_decoder_attn_result[0])) < (
                                  1e-3 if use_cuda else 1e-4))

            with open(fname, "a") as fh:
                fh.write(
                    f"\"({batch_size},{seq_length:03})\", {torch_qps}, {turbo_qps}\n"
                )
Example #13
    def check_torch_and_turbo(self, use_cuda=True):
        self.init_data(use_cuda=use_cuda)
        self.num_iter = 2

        turbo_bert_layer_result = None
        turbo_model = lambda: self.turbo_bert_encoder(self.input_tensor,
                                                      self.attention_mask,
                                                      output_attentions=True,
                                                      output_hidden_states=True
                                                      )

        turbo_bert_layer_result, turbo_qps, turbo_time_consume = \
            test_helper.run_model(turbo_model, use_cuda, self.num_iter)

        print(f"BertEncoder TurboTransform QPS, {turbo_qps}, ",
              f"Time Cost, {turbo_time_consume}")

        # turbo_bert_layer_result = self.turbo_bert_encoder(
        #     self.input_tensor,
        #     self.attention_mask,
        #     output_attentions = True,
        #     output_hidden_states = False)

        torch_model = lambda: self.torch_encoder_layer(
            self.input_tensor,
            self.attention_mask, [None] * self.cfg.num_hidden_layers,
            output_attentions=True,
            output_hidden_states=True)

        torch_bert_layer_result, torch_qps, torch_time_consume = \
            test_helper.run_model(torch_model, use_cuda, self.num_iter)

        print(f"BertEncoder Torch QPS, {torch_qps}, ",
              f"Time Cost, {torch_time_consume}")

        diff = torch.abs(torch_bert_layer_result[0] -
                         turbo_bert_layer_result[0])
        self.assertTrue(torch.max(diff) < 1e-2)

        # Note we did not print the last hidden_states, because it is the same as output
        # print(len(torch_bert_layer_result[1]), len(turbo_bert_layer_result[1]))
        for a, b in zip(torch_bert_layer_result[1],
                        turbo_bert_layer_result[1]):
            diff = torch.abs(a - b)
            self.assertTrue(torch.max(diff) < 1e-2)

        for a, b in zip(torch_bert_layer_result[2],
                        turbo_bert_layer_result[2]):
            diff = torch.abs(a - b)
            self.assertTrue(torch.max(diff) < 1e-2)
Example #14
    def check_torch_and_turbo(self, batch_size, seq_length, use_cuda,
                              use_memory_opt):
        self.init_data(use_cuda=use_cuda)
        self.input_tensor = torch.randint(low=0,
                                          high=self.cfg.vocab_size - 1,
                                          size=(batch_size, seq_length),
                                          device=self.test_device)

        device = "GPU" if use_cuda else "CPU"
        num_iter = 1

        if use_memory_opt:
            turbo_transformers.bert_opt_mem_allocate_api(
                self.input_tensor.size()[0],  # batch
                self.input_tensor.size()[1],  # seq_len
                self.cfg.num_attention_heads,
                self.cfg.hidden_size,
                self.cfg.num_hidden_layers,
                "GPU" if 'cuda' in self.input_tensor.device.type else "CPU")

        turbo_model = lambda: self.turbo_model(
            self.input_tensor, attention_mask=None, head_mask=None)
        turbo_result, turbo_qps, turbo_time = \
            test_helper.run_model(turbo_model, use_cuda, num_iter)

        print(
            f"AlbertModel \"({batch_size},{seq_length:03})\" ",
            f"{device} TurboTransform QPS,  {turbo_qps}, time, {turbo_time}")
        torch_model = lambda: self.torch_model(
            input_ids=self.input_tensor, attention_mask=None, head_mask=None)
        with turbo_transformers.pref_guard("albert_perf") as perf:
            torch_result, torch_qps, torch_time = \
                test_helper.run_model(torch_model, use_cuda, num_iter)

        print(f"AlbertModel \"({batch_size},{seq_length:03})\" ",
              f"{device} Torch QPS,  {torch_qps}, time, {torch_time}")

        # print(turbo_result[-1])
        # print(turbo_result, torch_result[0])
        # TODO(jiaruifang) Error is too high. Does tensor core introduce more differences?
        tolerate_error = 1e-2
        self.assertTrue(
            torch.max(torch.abs(torch_result[0] -
                                turbo_result[0])) < tolerate_error)

        with open("albert_model_res.txt", "a") as fh:
            fh.write(
                f"\"({batch_size},{seq_length:03})\", {torch_qps}, {turbo_qps}\n"
            )
Example #15
    def check_torch_and_turbo(self, use_cuda, use_pooler):
        self.init_data(use_cuda)
        num_iter = 2
        device_name = "GPU" if use_cuda else "CPU"
        input_ids = torch.randint(low=0,
                                  high=self.cfg.vocab_size - 1,
                                  size=(2, 32),
                                  dtype=torch.long,
                                  device=self.test_device)

        torch_model = lambda: self.torch_model(input_ids)
        torch_result, torch_qps, torch_time = \
            test_helper.run_model(torch_model, use_cuda, num_iter)
        print(f'BertModel Plain PyTorch({device_name}) QPS {torch_qps}')

        turbo_model = (
            lambda: self.turbo_pooler_model(input_ids)) if use_pooler else (
                lambda: self.turbo_model(input_ids))
        turbo_result, turbo_qps, turbo_time = \
            test_helper.run_model(turbo_model, use_cuda, num_iter)
        print(f'BertModel TurboTransformer({device_name}) QPS {turbo_qps}')

        torch_result_final = (torch_result[1]).cpu().numpy(
        ) if use_pooler else torch_result[0][:, 0].cpu().numpy()

        turbo_result_final = turbo_result[0].cpu().numpy(
        ) if use_pooler else turbo_result.cpu().numpy()

        #TODO(jiaruifang, v_cshi) check why pooler introduce more difference
        if use_pooler:
            print(
                "encode output diff: ",
                numpy.max((torch_result[0][:, 0]).cpu().numpy() -
                          turbo_result[1].cpu().numpy()).reshape(-1))
            print(
                "pooler output diff: ",
                numpy.max(
                    (turbo_result_final - torch_result_final).reshape(-1)))
        (atol, rtol) = (1e-2, 1e-2) if use_pooler else (5e-3, 1e-4)

        self.assertTrue(
            numpy.allclose(torch_result_final,
                           turbo_result_final,
                           atol=atol,
                           rtol=rtol))
Example #16
    def check_torch_and_turbo(self,
                              use_cuda,
                              batch_size,
                              seq_len,
                              use_memory_opt=True):
        self.init_data(use_cuda)
        num_iter = 1
        device_name = "GPU" if use_cuda else "CPU"
        input_ids = torch.randint(low=0,
                                  high=self.cfg.vocab_size - 1,
                                  size=(batch_size, seq_len),
                                  dtype=torch.long,
                                  device=self.test_device)

        torch_model = lambda: self.torch_model(input_ids)
        torch_result, torch_qps, torch_time = \
            test_helper.run_model(torch_model, use_cuda, num_iter)
        print(f'BertModel PyTorch({device_name}) QPS {torch_qps}')

        turbo_model = (lambda: self.turbo_model(input_ids))

        if use_memory_opt:
            turbo_transformers.bert_opt_mem_allocate_api(
                input_ids.size()[0],  # batch
                input_ids.size()[1],  # seq_len
                self.cfg.num_attention_heads,
                self.cfg.hidden_size,
                self.cfg.num_hidden_layers,
                "GPU" if 'cuda' in input_ids.device.type else "CPU")

        with turbo_transformers.pref_guard("bert_perf") as perf:
            turbo_result, turbo_qps, turbo_time = \
                test_helper.run_model(turbo_model, use_cuda, num_iter)
        print(f'BertModel TurboTransformer({device_name}) QPS {turbo_qps}')

        print(f"batch {batch_size} seq_len {seq_len}")
        print(torch.max(torch_result[0].cpu() - turbo_result[0].cpu()))
        self.assertTrue(
            numpy.allclose(torch_result[0].cpu(),
                           turbo_result[0].cpu(),
                           atol=1e-2,
                           rtol=1e-3))
Example #17
        def check_torch_and_turbo(self, use_cuda, num_iter=1):
            onmt_ffn, turbo_ffn_trans, turbo_ffn_notrans, inputs = self.init_data(
                use_cuda)
            device = "GPU" if use_cuda else "CPU"
            onmt_model = lambda: onmt_ffn(inputs)
            onmt_model_result, torch_qps, torch_time_consume = \
                test_helper.run_model(onmt_model, use_cuda, num_iter)

            print(
                f"PositionwiseFeedForward \"({batch_size}, {input_len:03})\" ",
                f"{device} ONMT QPS, {torch_qps}, time, {torch_time_consume}")

            turbo_model_trans = lambda: turbo_ffn_trans(inputs,
                                                        is_trans_weight=True)
            with turbo_transformers.pref_guard("gpref_test") as perf:
                turbo_model_result, turbo_qps_trans, turbo_time_consume_trans = \
                    test_helper.run_model(turbo_model_trans, use_cuda, num_iter)

            print(
                f"PositionwiseFeedForward \"({batch_size}, {input_len:03})\" ",
                f"{device} Turbo Trans QPS, {turbo_qps_trans}, time, {turbo_time_consume_trans}"
            )

            turbo_model_notrans = lambda: turbo_ffn_notrans(
                inputs, is_trans_weight=False)
            with turbo_transformers.pref_guard("gpref_test") as perf:
                turbo_model_result, turbo_qps_notrans, turbo_time_consume_notrans = \
                    test_helper.run_model(turbo_model_notrans, use_cuda, num_iter)

            print(
                f"PositionwiseFeedForward Notrans \"({batch_size}, {input_len:03})\" ",
                f"{device} Turbo NoTrans QPS, {turbo_qps_notrans}, time, {turbo_time_consume_notrans}"
            )
            self.assertTrue(
                torch.max(torch.abs(turbo_model_result - onmt_model_result)) <
                (1e-3 if use_cuda else 1e-4))

            with open(fname, "a") as fh:
                fh.write(
                    f"\"({batch_size},{input_len:03})\", {torch_qps}, {turbo_qps_trans}, {turbo_qps_notrans}\n"
                )
Example #18
        def check_torch_and_turbo(self, use_cuda):
            input_ids, position_ids, token_type_ids = self.init_data(use_cuda)

            device = "GPU" if use_cuda else "CPU"
            num_iter = 100
            torch_model = lambda: self.torch_embedding(
                input_ids, token_type_ids, position_ids)
            torch_result, torch_qps, torch_time = test_helper.run_model(
                torch_model, use_cuda, num_iter)
            print(f"AlbertEmbeddings \"({batch_size},{seq_length:03})\" ",
                  f"{device} Torch QPS,  {torch_qps}, time, {torch_time}")

            turbo_model = lambda: self.turbo_embedding(input_ids, position_ids,
                                                       token_type_ids)
            turbo_result, turbo_qps, turbo_time = test_helper.run_model(
                turbo_model, use_cuda, num_iter)
            print(f"BertEmbeddings \"({batch_size},{seq_length:03})\" ",
                  f"{device} Turbo QPS,  {turbo_qps}, time, {turbo_time}")

            self.assertTrue(
                torch.max(torch.abs(torch_result - turbo_result)) < 1e-5)
Example #19
        def check_torch_and_turbo(self, use_cuda):
            self.init_data(use_cuda=use_cuda)
            device = "GPU" if use_cuda else "CPU"

            num_iter = 2
            hidden_size = self.cfg.hidden_size
            input_tensor = torch.rand(size=(batch_size, 1, hidden_size),
                                      dtype=torch.float32,
                                      device=self.test_device)

            torch_model = lambda: self.torch_pooler(input_tensor)
            torch_result, torch_qps, torch_time = \
                test_helper.run_model(torch_model, use_cuda, num_iter)
            print(f"BertPooler \"({batch_size},{hidden_size:03})\" ",
                  f"{device} Torch QPS,  {torch_qps}, time, {torch_time}")

            turbo_model = lambda: self.turbo_pooler(
                input_tensor.reshape((batch_size, hidden_size)))
            turbo_result, turbo_qps, turbo_time = \
                test_helper.run_model(turbo_model, use_cuda, num_iter)

            print(
                f"BertPooler \"({batch_size}, {hidden_size})\" ",
                f"{device} TurboTransform QPS,  {turbo_qps}, time, {turbo_time}"
            )

            torch_result = torch_result.cpu().numpy()
            turbo_result = turbo_result.cpu().numpy()

            self.assertTrue(
                numpy.allclose(torch_result,
                               turbo_result,
                               rtol=1e-4,
                               atol=1e-3))

            with open("bert_pooler_res.txt", "a") as fh:
                fh.write(
                    f"\"({batch_size},{hidden_size:03})\", {torch_qps}, {turbo_qps}\n"
                )
Example #20
        def check_torch_and_turbo(self, use_cuda):
            self.init_data(use_cuda)
            num_iter = 2
            device = "GPU" if use_cuda else "CPU"

            torch_model = lambda: self.torch_bertout(self.intermediate_output,
                                                     self.attention_output)
            torch_result, torch_qps, torch_time = \
                test_helper.run_model(torch_model, use_cuda, num_iter)
            print(f'Bert Output Plain PyTorch({device}) QPS {torch_qps}')

            turbo_model = lambda: self.turbo_bertout(self.intermediate_output,
                                                     self.attention_output)
            turbo_result, turbo_qps, turbo_time = \
                test_helper.run_model(turbo_model, use_cuda, num_iter)
            print(
                f'Bert Output Plain TurboTransformer({device}) QPS {turbo_qps}'
            )

            # cuda version precision is lower due to tensor-core
            self.assertTrue(
                torch.max(torch.abs(torch_result - turbo_result)) < (
                    1e-2 if use_cuda else 1e-4))
Example #21
        def check_torch_and_turbo(self, use_cuda):
            self.init_data(use_cuda=use_cuda)
            device = "GPU" if use_cuda else "CPU"
            num_iter = 2
            turbo_model = lambda: self.turbo_layer(
                self.input_tensor, self.attention_mask, output_attentions=True)
            turbo_result, turbo_qps, turbo_time = \
                test_helper.run_model(turbo_model, use_cuda, num_iter)

            print(
                f"AlbertLayer \"({batch_size},{seq_length:03})\" ",
                f"{device} TurboTransform QPS,  {turbo_qps}, time, {turbo_time}"
            )

            torch_model = lambda: self.torch_layer(
                self.input_tensor, self.attention_mask, output_attentions=True)
            torch_result, torch_qps, torch_time = \
                test_helper.run_model(torch_model, use_cuda, num_iter)

            print(f"AlbertLayer \"({batch_size},{seq_length:03})\" ",
                  f"{device} Torch QPS,  {torch_qps}, time, {torch_time}")

            # print(turbo_result - torch_result[0])
            # TODO(jiaruifang) Error is too high. Does tensor core introduce more differences?
            cpu_tolerate_error = 1e-5
            gpu_tolerate_error = 1e-3
            self.assertTrue(
                torch.max(torch.abs(torch_result[0] - turbo_result[0])) <
                (gpu_tolerate_error if use_cuda else cpu_tolerate_error))
            self.assertTrue(
                torch.max(torch.abs(torch_result[1] - turbo_result[1])) <
                (gpu_tolerate_error if use_cuda else cpu_tolerate_error))
            with open("albert_layer_res.txt", "a") as fh:
                fh.write(
                    f"\"({batch_size},{seq_length:03})\", {torch_qps}, {turbo_qps}\n"
                )
Example #22
        def check_torch_and_turbo(self, use_cuda, num_iter=1):
            deivce_type = "GPU" if use_cuda else "CPU"
            info = f"\"({deivce_type}, {batch_size}, {src_length}, {T})\""

            step = 2
            self.init_data(use_cuda=use_cuda)

            self.inputs = torch.rand(batch_size,
                                     T,
                                     self.model_dim,
                                     dtype=torch.float32,
                                     device=self.test_device)
            self.memory_bank = torch.rand(batch_size,
                                          src_length,
                                          self.model_dim,
                                          dtype=torch.float32,
                                          device=self.test_device)

            self.src_pad_mask = torch.zeros(batch_size,
                                            1,
                                            src_length,
                                            dtype=torch.float32,
                                            device=self.test_device).bool()
            self.tgt_pad_mask = torch.zeros(batch_size,
                                            1,
                                            T,
                                            dtype=torch.float32,
                                            device=self.test_device).bool()

            onmt_model = lambda: self.onmt_decoder(self.inputs,
                                                   self.memory_bank,
                                                   self.src_pad_mask,
                                                   self.tgt_pad_mask,
                                                   layer_cache=None,
                                                   step=step,
                                                   future=False)

            onmt_result, torch_qps, torch_time_consume = \
                test_helper.run_model(onmt_model, use_cuda, num_iter)

            onmt_mid, attns, attn_align = onmt_result

            print(
                f"ONMT Decoder {info} ",
                f"{deivce_type} QPS, {torch_qps}, time, {torch_time_consume}")

            if with_quantize_dynamic and not use_cuda:
                quantized_onmt_model = lambda: self.quantized_onmt_decoder(
                    self.inputs,
                    self.memory_bank,
                    self.src_pad_mask,
                    self.tgt_pad_mask,
                    layer_cache=None,
                    step=step,
                    future=False)

                quantized_onmt_result, quantized_torch_qps, quantized_torch_time_consume = \
                    test_helper.run_model(quantized_onmt_model, use_cuda, num_iter)

                quantized_onmt_mid, quantized_attns, quantized_attn_align = quantized_onmt_result

                print(
                    f"ONMT Quantized Decoder {info} ",
                    f"{deivce_type} QPS, {quantized_torch_qps}, time, {quantized_torch_time_consume}"
                )

                # print(onmt_mid)
                # print(quantized_onmt_mid)

                # self.assertTrue(
                #     torch.max(torch.abs(onmt_mid -
                #                         quantized_onmt_mid)) < (1e-3 if use_cuda else 1e-4))
                # self.assertTrue(
                #     torch.max(torch.abs(attns - quantized_attns)) < (
                #         1e-3 if use_cuda else 1e-4))

            turbo_model = lambda: self.turbo_decoder(self.inputs,
                                                     self.memory_bank,
                                                     self.src_pad_mask,
                                                     self.tgt_pad_mask,
                                                     layer_cache=None,
                                                     step=step,
                                                     future=False)

            with turbo_transformers.pref_guard(info) as perf:
                turbo_result, turbo_qps, turbo_time_consume = \
                    test_helper.run_model(turbo_model, use_cuda, num_iter)

            turbo_mid, turbo_attns, _ = turbo_result

            print(
                f"Turbo Decoder {info} ",
                f"{deivce_type} QPS, {turbo_qps}, time, {turbo_time_consume}")

            self.assertTrue(
                torch.max(torch.abs(onmt_mid -
                                    turbo_mid)) < (1e-3 if use_cuda else 1e-4))
            self.assertTrue(
                torch.max(torch.abs(attns - turbo_attns)) < (
                    1e-3 if use_cuda else 1e-4))

            if with_quantize_dynamic and not use_cuda:
                with open(fname, "a") as fh:
                    fh.write(
                        f"{info} {torch_qps}, {quantized_torch_qps}, {turbo_qps}\n"
                    )
            else:
                with open(fname, "a") as fh:
                    fh.write(f"{info} {torch_qps}, {turbo_qps}\n")
Example #23
        def check_torch_and_turbo(self, use_cuda, num_iter=1):
            onmt_multi_headed_attention, torch_layernorm, turbo_attn_trans, turbo_attn_notrans, Q, K, V = \
                self.init_data(use_cuda)
            device = "GPU" if use_cuda else "CPU"
            info = f"\"({device}, {set_layer_cache}, {pre_layernorm}, {post_add_input}, {attn_type}, {batch_size}, {key_seq_len:03}, {query_seq_len:03})\""

            if attn_type == "context":
                attention_mask = torch.zeros((batch_size, 1, key_seq_len),
                                             dtype=torch.bool,
                                             device=self.test_device)
            elif attn_type == "self":
                attention_mask = None
                # torch.zeros(
                #     (batch_size, query_seq_len, key_seq_len),
                #     dtype=torch.bool,
                #     device=self.test_device)
            else:
                raise ValueError("attn type is not supported")

            # set layer_cache
            if set_layer_cache:
                memory_keys = torch.rand(size=(batch_size, self.head_count,
                                               key_seq_len,
                                               self.size_per_head),
                                         dtype=torch.float32,
                                         device=self.test_device)
                memory_values = torch.rand(size=(batch_size, self.head_count,
                                                 key_seq_len,
                                                 self.size_per_head),
                                           dtype=torch.float32,
                                           device=self.test_device)
                self_keys = torch.rand(size=(batch_size, self.head_count,
                                             query_seq_len,
                                             self.size_per_head),
                                       dtype=torch.float32,
                                       device=self.test_device)
                self_values = torch.rand(size=(batch_size, self.head_count,
                                               query_seq_len,
                                               self.size_per_head),
                                         dtype=torch.float32,
                                         device=self.test_device)
                print("self_keys size: ", self_keys.size())
                layer_cache_torch = {
                    "memory_keys": torch.clone(memory_keys),
                    "memory_values": torch.clone(memory_values),
                    "self_keys": torch.clone(self_keys),
                    "self_values": torch.clone(self_values)
                }
            else:
                layer_cache_torch = {
                    "memory_keys": None,
                    "memory_values": None,
                    "self_keys": None,
                    "self_values": None
                }

            onmt_model = lambda: onmt_multi_headed_attention(
                K,
                V,
                torch.clone(torch_layernorm(Q)) if pre_layernorm else Q,
                mask=attention_mask,
                layer_cache=layer_cache_torch,
                attn_type=attn_type)

            onmt_multi_headed_attention_result, torch_qps, torch_time_consume = \
                test_helper.run_model(onmt_model, use_cuda, num_iter) # return output, attns

            onmt_attns = onmt_multi_headed_attention_result[1]
            if post_add_input:
                onmt_output = onmt_multi_headed_attention_result[0] + Q
            else:
                onmt_output = onmt_multi_headed_attention_result[0]
            print(
                f"Multi Headed Attention {info} ONMT, QPS,{torch_qps}, time, {torch_time_consume}"
            )

            if with_quantize_dynamic and not use_cuda:
                q_onmt_model = lambda: self.q_onmt_multi_headed_attention(
                    K,
                    V,
                    torch.clone(torch_layernorm(Q)) if pre_layernorm else Q,
                    mask=attention_mask,
                    layer_cache=layer_cache_torch,
                    attn_type=attn_type)

                q_onmt_multi_headed_attention_result, q_torch_qps, q_torch_time_consume = \
                    test_helper.run_model(q_onmt_model, use_cuda, num_iter) # return output, attns
                onmt_attns = q_onmt_multi_headed_attention_result[1]
                if post_add_input:
                    onmt_output = q_onmt_multi_headed_attention_result[0] + Q
                else:
                    onmt_output = q_onmt_multi_headed_attention_result[0]

                print(
                    f"Multi Headed Attention {info} Q-ONMT, QPS, {q_torch_qps}, time, {q_torch_time_consume}"
                )

            # benchmarking turbo with weight transposed
            turbo_attention_mask = attention_mask.float(
            ) * -1e18 if attention_mask is not None else None

            if set_layer_cache:
                layer_cache_turbo = {
                    "memory_keys": torch.clone(memory_keys),
                    "memory_values": torch.clone(memory_values),
                    "self_keys": torch.clone(self_keys),
                    "self_values": torch.clone(self_values)
                }
            else:
                layer_cache_turbo = {
                    "memory_keys": None,
                    "memory_values": None,
                    "self_keys": None,
                    "self_values": None
                }

            turbo_model_trans = lambda: turbo_attn_trans(
                K,
                V,
                Q,
                turbo_attention_mask,
                layer_cache=layer_cache_turbo,
                attn_type=attn_type,
                pre_layernorm=pre_layernorm,
                post_add_input=post_add_input,
                is_trans_weight=True)

            # with turbo_transformers.pref_guard("pref_test") as perf:
            turbo_result, turbo_qps, turbo_time_consume = \
                test_helper.run_model(turbo_model_trans, use_cuda,
                                    num_iter)

            turbo_output_trans, turbo_attns_trans = turbo_result
            print(
                f"Multi Headed Attention {info} Turbo Trans, QPS, {turbo_qps}, time, {turbo_time_consume}"
            )
            self.assertTrue(
                torch.max(torch.abs(onmt_output - turbo_output_trans)) < (
                    1e-3 if use_cuda else 1e-4))
            self.assertTrue(
                torch.max(torch.abs(onmt_attns - turbo_attns_trans)) < (
                    1e-3 if use_cuda else 1e-4))

            if layer_cache_torch is not None:
                for k, v in layer_cache_torch.items():
                    if v is not None:
                        self.assertTrue(
                            torch.max(torch.abs(layer_cache_turbo[k] -
                                                v)) < 1e-3)

            # benchmarking turbo with weight not transposed
            if set_layer_cache:
                layer_cache_turbo = {
                    "memory_keys": torch.clone(memory_keys),
                    "memory_values": torch.clone(memory_values),
                    "self_keys": torch.clone(self_keys),
                    "self_values": torch.clone(self_values)
                }
            else:
                layer_cache_turbo = {
                    "memory_keys": None,
                    "memory_values": None,
                    "self_keys": None,
                    "self_values": None
                }

            turbo_model_notrans = lambda: turbo_attn_notrans(
                K,
                V,
                Q,
                turbo_attention_mask,
                layer_cache=layer_cache_turbo,
                attn_type=attn_type,
                pre_layernorm=pre_layernorm,
                post_add_input=post_add_input,
                is_trans_weight=False)

            with turbo_transformers.pref_guard("pref_test") as perf:
                turbo_result, turbo_qps, turbo_time_consume_notrans = \
                    test_helper.run_model(turbo_model_notrans, use_cuda,
                                        num_iter)

            turbo_output_notrans, turbo_attns_notrans = turbo_result

            print(
                f"Multi Headed Attention {info} Turbo NoTrans, QPS,{turbo_qps}, time, {turbo_time_consume_notrans}"
            )

            self.assertTrue(
                torch.max(torch.abs(onmt_output - turbo_output_notrans)) < (
                    1e-3 if use_cuda else 1e-4))
            self.assertTrue(
                torch.max(torch.abs(onmt_attns - turbo_attns_notrans)) < (
                    1e-3 if use_cuda else 1e-4))

            if with_quantize_dynamic and not use_cuda:
                with open(fname, "a") as fh:
                    fh.write(
                        f"{info} {torch_qps}, {q_torch_qps}, {turbo_qps}\n")
            else:
                with open(fname, "a") as fh:
                    fh.write(f"{info} {torch_qps}, {turbo_qps}\n")