def init_data(self, use_cuda):
            test_device = torch.device('cuda:0') if use_cuda else \
                   torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(4)
                turbo_transformers.set_num_threads(4)

            torch.set_grad_enabled(False)
            self.cfg = BertConfig(attention_probs_dropout_prob=0.0,
                                  hidden_dropout_prob=0.0)
            self.cfg.output_attentions = True
            torch_attention = BertAttention(self.cfg)
            torch_attention.eval()
            if use_cuda:
                torch_attention.to(test_device)

            # build the turbo attention module from the torch attention
            turbo_attention = turbo_transformers.BertAttention.from_torch(
                torch_attention)

            turbo_decoder_attention = turbo_transformers.MultiHeadedAttention.from_torch(
                torch_attention, is_trans_weight=False)

            hidden_size = self.cfg.hidden_size
            input_tensor = torch.rand(size=(batch_size, seq_length,
                                            hidden_size),
                                      dtype=torch.float32,
                                      device=test_device)
            attention_mask = torch.ones((batch_size, seq_length),
                                        dtype=torch.float32,
                                        device=test_device)
            attention_mask = attention_mask[:, None, None, :]
            attention_mask = (1.0 - attention_mask) * -10000.0
            return torch_attention, turbo_attention, turbo_decoder_attention, input_tensor, attention_mask
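# A minimal sketch (not from the original test) of how the objects returned by
# init_data above might be compared; the call pattern of the turbo attention and
# the tolerance are assumptions for illustration only.
def check_attention_close(torch_attention, turbo_attention, input_tensor, attention_mask):
    import torch
    torch_out = torch_attention(input_tensor, attention_mask)[0]
    turbo_out = turbo_attention(input_tensor, attention_mask)
    if isinstance(turbo_out, tuple):
        turbo_out = turbo_out[0]
    return bool(torch.max(torch.abs(torch_out - turbo_out)) < 1e-3)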
Example No. 2
        def init_data(self, use_cuda):
            self.test_device = torch.device('cuda:0') if use_cuda else \
                torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(4)
                turbo_transformers.set_num_threads(4)

            self.cfg = DistilBertConfig(attention_probs_dropout_prob=0.0,
                                        hidden_dropout_prob=0.0)

            torch.set_grad_enabled(False)
            self.torch_ffn = DistilFFN(self.cfg)
            self.torch_ffn.eval()
            self.output_layer_norm = torch.nn.LayerNorm(
                normalized_shape=self.cfg.dim, eps=1e-12)
            if use_cuda:
                self.torch_ffn.to(self.test_device)
                self.output_layer_norm.to(self.test_device)

            self.turbo_ffn = turbo_transformers.DistrillFFN.from_torch(
                self.torch_ffn, self.output_layer_norm)
            # (batch_size, input_len, model_dim)
            self.inputs = torch.rand(size=(batch_size, input_len,
                                           self.cfg.dim),
                                     dtype=torch.float32,
                                     device=self.test_device)

            print(self.cfg.activation)
Example No. 3
def benchmark_turbo_transformers(model_name: str, seq_len: int,
                                 batch_size: int, n: int, num_threads: int):
    import torch
    import transformers
    import contexttimer
    import turbo_transformers
    import cProfile
    import benchmark_helper
    turbo_transformers.set_num_threads(num_threads)

    if model_name == "bert":
        cfg = transformers.BertConfig()
        model = transformers.BertModel(cfg)
        model.eval()
        model = turbo_transformers.BertModel.from_torch(model)
    elif model_name == "albert":
        cfg = transformers.AlbertConfig()
        model = transformers.AlbertModel(cfg)
        model.eval()
        model = turbo_transformers.AlbertModel.from_torch(model)
    elif model_name == "roberta":
        cfg = transformers.RobertaConfig()
        model = transformers.RobertaModel(cfg)
        model.eval()
        model = turbo_transformers.RobertaModel.from_torch(model)
    else:
        raise ValueError(f"benchmark does not support {model_name}")

    cfg = model.config  # type: transformers.BertConfig
    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(batch_size, seq_len),
                              dtype=torch.long)
    benchmark_helper.run_model(lambda: model(input_ids), False, n, batch_size,
                               seq_len, "turbo", num_threads)
        def init_data(self, use_cuda):
            self.test_device = torch.device('cuda:0') if use_cuda else \
                torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(4)
                turbo_transformers.set_num_threads(4)

            self.cfg = DistilBertConfig(attention_probs_dropout_prob=0.0,
                                        hidden_dropout_prob=0.0)

            torch.set_grad_enabled(False)
            self.torch_transformer_block = DistilTransformerBlock(self.cfg)
            self.torch_transformer_block.eval()
            if use_cuda:
                self.torch_transformer_block.to(self.test_device)

            self.turbo_transformer_block = turbo_transformers.DistrillTransformerBlock.from_torch(
                self.torch_transformer_block)
            # attention_mask: (batch_size, input_len); inputs: (batch_size, input_len, model_dim)
            self.attention_mask = torch.ones((batch_size, input_len),
                                             dtype=torch.float32,
                                             device=self.test_device)

            self.inputs = torch.rand(size=(batch_size, input_len,
                                           self.cfg.dim),
                                     dtype=torch.float32,
                                     device=self.test_device)
Example No. 5
        def init_data(self, use_cuda: bool) -> None:
            self.test_device = torch.device('cuda:0') if use_cuda else \
                torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(4)
                turbo_transformers.set_num_threads(4)

            torch.set_grad_enabled(False)
            self.cfg = AlbertConfig()

            self.torch_layer = AlbertLayer(self.cfg)
            if torch.cuda.is_available():
                self.torch_layer.to(self.test_device)
            self.torch_layer.eval()
            self.hidden_size = self.cfg.hidden_size
            self.input_tensor = torch.rand(size=(batch_size, seq_length,
                                                 self.hidden_size),
                                           dtype=torch.float32,
                                           device=self.test_device)

            self.attention_mask = torch.ones((batch_size, seq_length),
                                             dtype=torch.float32,
                                             device=self.test_device)
            self.attention_mask = self.attention_mask[:, None, None, :]
            self.attention_mask = (1.0 - self.attention_mask) * -10000.0

            self.turbo_layer = turbo_transformers.AlbertLayer.from_torch(
                self.torch_layer)
Example No. 6
        def init_data(self, use_cuda):
            test_device = torch.device('cuda:0') if use_cuda else \
                torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(4)
                turbo_transformers.set_num_threads(4)

            torch.set_grad_enabled(False)
            self.cfg = DistilBertConfig(attention_probs_dropout_prob=0.0,
                                        hidden_dropout_prob=0.0)
            self.cfg.output_attentions = True
            self.torch_attention = DistilAttention(self.cfg)
            self.torch_sa_layer_norm = nn.LayerNorm(
                normalized_shape=self.cfg.dim, eps=1e-12)
            self.torch_attention.eval()
            self.torch_sa_layer_norm.eval()
            if use_cuda:
                self.torch_attention.to(test_device)
                self.torch_sa_layer_norm.to(test_device)

            # build the turbo attention module from the torch attention
            self.turbo_attention = turbo_transformers.DistillBertAttention.from_torch(
                self.torch_attention, self.torch_sa_layer_norm)

            hidden_size = self.cfg.hidden_size
            self.input_tensor = torch.rand(size=(batch_size, seq_length,
                                                 hidden_size),
                                           dtype=torch.float32,
                                           device=test_device)
            # NOTE: the attention mask used by DistilBert attention differs from the huggingface BERT attention mask.
            self.attention_mask = torch.ones((batch_size, seq_length),
                                             dtype=torch.float32,
                                             device=test_device)
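            # DistilBert's attention consumes this (batch_size, seq_length) 0/1 mask
            # directly, while the BERT attention above expects the mask expanded to
            # (batch_size, 1, 1, seq_length) and turned into an additive bias via
            # (1.0 - mask) * -10000.0 before the softmax.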
        def init_data(self, use_cuda):
            self.test_device = torch.device('cuda:0') if use_cuda else \
                   torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(4)
                turbo_transformers.set_num_threads(4)

            self.model_dim = 1024
            self.d_ff = 4096

            torch.set_grad_enabled(False)
            onmt_ffn = PositionwiseFeedForward(self.model_dim, self.d_ff)
            onmt_ffn.eval()
            if use_cuda:
                onmt_ffn.to(self.test_device)

            turbo_ffn_trans = turbo_transformers.PositionwiseFeedForward.from_onmt(
                onmt_ffn, is_trans_weight=True)
            turbo_ffn_notrans = turbo_transformers.PositionwiseFeedForward.from_onmt(
                onmt_ffn, is_trans_weight=False)
            # (batch_size, input_len, model_dim)
            inputs = torch.rand(size=(batch_size, input_len, self.model_dim),
                                dtype=torch.float32,
                                device=self.test_device)
            return onmt_ffn, turbo_ffn_trans, turbo_ffn_notrans, inputs
Example No. 8
        def init_data(self, use_cuda):
            self.test_device = torch.device('cuda:0') if use_cuda else \
                torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(4)
                turbo_transformers.set_num_threads(4)

            self.cfg = DistilBertConfig(attention_probs_dropout_prob=0.0,
                                        hidden_dropout_prob=0.0)

            torch.set_grad_enabled(False)
            self.torch_model = DistilBertModel(self.cfg)
            self.torch_model.eval()
            if use_cuda:
                self.torch_model.to(self.test_device)

            self.turbo_transformer = turbo_transformers.DistilBertModel.from_torch(
                self.torch_model)
            # token ids: (batch_size, input_len)
            self.inputs = torch.randint(low=0,
                                        high=self.cfg.vocab_size - 1,
                                        size=(batch_size, input_len),
                                        dtype=torch.long,
                                        device=self.test_device)
            self.attention_mask = torch.ones((batch_size, input_len),
                                             dtype=torch.long,
                                             device=self.test_device)
            self.head_mask = [None] * self.cfg.num_hidden_layers
def benchmark_turbo_transformers(model_name: str, seq_len: int,
                                 batch_size: int, n: int, enable_random: bool,
                                 max_seq_len: int, min_seq_len: int,
                                 num_threads: int, use_gpu: bool,
                                 enable_mem_opt: bool):
    import torch
    import transformers
    import turbo_transformers
    import benchmark_helper
    test_device = torch.device('cuda:0') if use_gpu else torch.device('cpu:0')
    cfg = None
    torch.set_grad_enabled(False)
    if model_name == "bert":
        cfg = transformers.BertConfig()
        model = transformers.BertModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.BertModel.from_torch(model, backend="turbo")
    elif model_name == "albert":
        cfg = transformers.AlbertConfig(hidden_size=768,
                                        num_attention_heads=12,
                                        intermediate_size=3072)
        model = transformers.AlbertModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.AlbertModel.from_torch(model)
    elif model_name == "roberta":
        cfg = transformers.RobertaConfig()
        model = transformers.RobertaModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.RobertaModel.from_torch(model)
    elif model_name == "distilbert":
        cfg = transformers.DistilBertConfig()
        model = transformers.DistilBertModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.DistilBertModel.from_torch(model)
    else:
        raise ValueError(f"benchmark does not support {model_name}")

    turbo_transformers.set_num_threads(num_threads)
    if enable_random:
        if enable_mem_opt:
            turbo_transformers.reset_allocator_schema("model-aware")
        benchmark_helper.run_variable_model(model, use_gpu, n, max_seq_len,
                                            min_seq_len, "turbo", num_threads,
                                            cfg, enable_mem_opt, model_name)
        if enable_mem_opt:
            turbo_transformers.reset_allocator_schema("naive")
    else:
        input_ids = torch.randint(low=0,
                                  high=cfg.vocab_size - 1,
                                  size=(batch_size, seq_len),
                                  dtype=torch.long,
                                  device=test_device)
        benchmark_helper.run_model(lambda: model(input_ids), use_gpu, n,
                                   batch_size, seq_len, "turbo", num_threads,
                                   enable_mem_opt, model_name)
        def init_data(self, use_cuda):
            self.test_device = torch.device('cuda:0') if use_cuda else \
                torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(4)
                turbo_transformers.set_num_threads(4)

            torch.set_grad_enabled(False)
            self.head_count = 16
            self.model_dim = 1024  # model_dim must be divisible by head_count
            self.size_per_head = int(self.model_dim / self.head_count)

            self.query_seq_len_list = query_seq_len_list
            self.key_seq_len_list = key_seq_len_list
            # build the torch model
            self.model = MultiHeadedAttention(self.head_count, self.model_dim)
            self.model.eval()

            if use_cuda:
                self.model.to(self.test_device)

            # prepare torch input data
            self.Q_list = []
            for query_seq_len in query_seq_len_list:
                Q = torch.rand(
                    size=(
                        1,
                        query_seq_len,  #from_seq
                        self.model_dim),
                    dtype=torch.float32,
                    device=self.test_device)
                self.Q_list.append(Q)

            self.K_list = []
            self.V_list = []
            for key_seq_len in key_seq_len_list:
                K = torch.rand(
                    size=(
                        1,
                        key_seq_len,  #to_seq
                        self.model_dim),
                    dtype=torch.float32,
                    device=self.test_device)

                V = torch.rand(
                    size=(
                        1,
                        key_seq_len,  #to_seq
                        self.model_dim),
                    dtype=torch.float32,
                    device=self.test_device)
                self.K_list.append(K)
                self.V_list.append(V)

            # prepare turbo smart batch model
            self.turbo_smart_pad = turbo_transformers.MultiHeadedAttentionSmartBatch.from_onmt(
                self.model)
        def init_data(self, use_cuda):
            self.test_device = torch.device('cuda:0') if use_cuda else \
                   torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(4)
                turbo_transformers.set_num_threads(4)

            torch.set_grad_enabled(False)
            self.head_count = 16
            self.model_dim = 1024  # model_dim must be divisible by head_count
            self.size_per_head = int(self.model_dim / self.head_count)

            onmt_multi_headed_attention = MultiHeadedAttention(
                self.head_count, self.model_dim)
            onmt_multi_headed_attention.eval()
            torch_layernorm = torch.nn.LayerNorm(self.model_dim, eps=1e-6)
            torch_layernorm.eval()

            if use_cuda:
                onmt_multi_headed_attention.to(self.test_device)
                torch_layernorm.to(self.test_device)

            K = torch.rand(
                size=(
                    batch_size,
                    key_seq_len,  #to_seq
                    self.model_dim),
                dtype=torch.float32,
                device=self.test_device)
            V = torch.rand(size=(batch_size, key_seq_len, self.model_dim),
                           dtype=torch.float32,
                           device=self.test_device)
            Q = torch.rand(
                size=(
                    batch_size,
                    query_seq_len,  #from_seq
                    self.model_dim),
                dtype=torch.float32,
                device=self.test_device)

            turbo_attn_trans = turbo_transformers.MultiHeadedAttention.from_onmt(
                onmt_multi_headed_attention,
                torch_layernorm,
                is_trans_weight=True)
            turbo_attn_notrans = turbo_transformers.MultiHeadedAttention.from_onmt(
                onmt_multi_headed_attention,
                torch_layernorm,
                is_trans_weight=False)

            if with_quantize_dynamic and not use_cuda:
                self.q_onmt_multi_headed_attention = torch.quantization.quantize_dynamic(
                    onmt_multi_headed_attention)
            return onmt_multi_headed_attention, torch_layernorm, turbo_attn_trans, turbo_attn_notrans, Q, K, V
Example No. 12
def test_smart_batch(use_cuda: bool):
    test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    cfg = transformers.BertConfig(attention_probs_dropout_prob=0.0,
                                  hidden_dropout_prob=0.0)
    torch_model = transformers.BertModel(cfg)

    # model_id = "bert-base-uncased"
    # torch_model = transformers.BertModel.from_pretrained(model_id)
    torch_model.eval()
    torch_model.to(test_device)
    torch.set_grad_enabled(False)

    cfg = torch_model.config
    # use 4 threads for computing
    if not use_cuda:
        turbo_transformers.set_num_threads(4)

    # Initialize a turbo BertModel with smart batching from torch model.
    turbo_model = turbo_transformers.BertModelSmartBatch.from_torch(
        torch_model)

    # a batch of queries with different lengths.
    query_seq_len_list = [18, 2, 3, 51]
    input_list = []

    # generate random inputs; real data can be used instead.
    for query_seq_len in query_seq_len_list:
        input_seq = torch.randint(low=0,
                                  high=cfg.vocab_size - 1,
                                  size=(1, query_seq_len),
                                  dtype=torch.long,
                                  device=test_device)
        input_list.append(input_seq)

    # start inference
    s_res = serial_bert_inference(torch_model, input_list)
    b_res = batch_bert_inference(turbo_model, input_list, query_seq_len_list)
    print(torch.max(torch.abs(b_res - s_res)))
    assert (torch.max(torch.abs(b_res - s_res)) < 1e-2)

    start_time = time.time()
    for i in range(10):
        serial_bert_inference(torch_model, input_list)
    end_time = time.time()
    print("\ntorch time consum: {}".format(end_time - start_time))

    start_time = time.time()
    for i in range(10):
        batch_bert_inference(turbo_model, input_list, query_seq_len_list)
    end_time = time.time()
    print("\nturbo time consum: {}".format(end_time - start_time))
def benchmark_turbo_transformers(model_name: str, seq_len: int,
                                 batch_size: int, n: int, enable_random: bool,
                                 max_seq_len: int, min_seq_len: int,
                                 num_threads: int, use_gpu: bool):
    import torch
    import transformers
    import contexttimer
    import turbo_transformers
    import benchmark_helper
    test_device = torch.device('cuda:0') if use_gpu else torch.device('cpu:0')
    if use_gpu:
        print("using GPU")
    else:
        print("using CPU")
    cfg = None
    torch.set_grad_enabled(False)
    if model_name == "bert":
        cfg = transformers.BertConfig()
        model = transformers.BertModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.BertModel.from_torch(model)
    elif model_name == "albert":
        cfg = transformers.AlbertConfig()
        model = transformers.AlbertModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.AlbertModel.from_torch(model)
    elif model_name == "roberta":
        cfg = transformers.RobertaConfig()
        model = transformers.RobertaModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.RobertaModel.from_torch(model)
    else:
        raise ValueError(f"benchmark does not support {model_name}")

    turbo_transformers.set_num_threads(num_threads)
    if enable_random:
        benchmark_helper.run_variable_model(model, use_gpu, n, max_seq_len,
                                            min_seq_len, "turbo", num_threads,
                                            cfg)
    else:
        input_ids = torch.randint(low=0,
                                  high=cfg.vocab_size - 1,
                                  size=(batch_size, seq_len),
                                  dtype=torch.long,
                                  device=test_device)

        benchmark_helper.run_model(lambda: model(input_ids), use_gpu, n,
                                   batch_size, seq_len, "turbo", num_threads)
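# A hypothetical driver for the benchmark above; the argument values are illustrative,
# not taken from the original script (which reads them from the command line).
if __name__ == "__main__":
    benchmark_turbo_transformers("bert",
                                 seq_len=40,
                                 batch_size=1,
                                 n=10,
                                 enable_random=False,
                                 max_seq_len=50,
                                 min_seq_len=5,
                                 num_threads=4,
                                 use_gpu=False)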
Example No. 14
    def init_data(self, use_cuda) -> None:
        torch.set_grad_enabled(False)
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)
        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')

        self.cfg = BertConfig()
        self.torch_model = BertModel(self.cfg)
        self.torch_model.eval()

        if torch.cuda.is_available():
            self.torch_model.to(self.test_device)

        self.turbo_model = turbo_transformers.BertModel.from_torch(
            self.torch_model, self.test_device, "turbo")
Example No. 15
def test(loadtype: LoadType):
    # use 4 threads for computing
    turbo_transformers.set_num_threads(4)
    model_id = "bert-base-uncased"
    model = transformers.BertModel.from_pretrained(model_id)
    model.eval()
    cfg = model.config

    input_ids = torch.tensor(
        ([12166, 10699, 16752, 4454], [5342, 16471, 817, 16022]),
        dtype=torch.long)
    position_ids = torch.tensor(([1, 0, 0, 0], [1, 1, 1, 0]), dtype=torch.long)
    segment_ids = torch.tensor(([1, 1, 1, 0], [1, 0, 0, 0]), dtype=torch.long)
    torch.set_grad_enabled(False)
    torch_res = model(
        input_ids, position_ids=position_ids, token_type_ids=segment_ids
    )  # sequence_output, pooled_output, (hidden_states), (attentions)
    print("torch bert sequence output: ",
          torch_res[0][:, 0, :])  #get the first sequence
    print("torch bert pooler output: ", torch_res[1])  # pooled_output

    # there are three ways to load a pretrained model.
    if loadtype is LoadType.PYTORCH:
        # 1. from a PyTorch model that has already loaded pretrained weights
        tt_model = turbo_transformers.BertModel.from_torch(model)
    elif loadtype is LoadType.PRETRAINED:
        # 2. directly load from checkpoint (torch saved model)
        tt_model = turbo_transformers.BertModel.from_pretrained(model_id)
    elif loadtype is LoadType.NPZ:
        # 3. load model from npz
        if len(sys.argv) == 2:
            try:
                print(sys.argv[1])
                in_file = sys.argv[1]
            except Exception:
                sys.exit("ERROR: cannot open " + sys.argv[1])
        else:
            in_file = "/workspace/bert_torch.npz"
        tt_model = turbo_transformers.BertModel.from_npz(in_file, cfg)
    else:
        raise ("LoadType is not supported")
    res = tt_model(
        input_ids, position_ids=position_ids,
        token_type_ids=segment_ids)  # sequence_output, pooled_output
    print("turbo bert sequence output:", res[0], res[0].size())
    print("turbo bert pooler output: ", res[1])  # pooled_output
        def init_data(self, use_cuda: bool) -> None:
            self.test_device = torch.device('cuda:0') if use_cuda else \
                torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(4)
                turbo_transformers.set_num_threads(4)

            torch.set_grad_enabled(False)
            self.cfg = BertConfig()

            self.torch_intermediate = BertIntermediate(self.cfg)
            if torch.cuda.is_available():
                self.torch_intermediate.to(self.test_device)
            self.torch_intermediate.eval()

            self.turbo_intermediate = turbo_transformers.BertIntermediate.from_torch(
                self.torch_intermediate)
Example No. 17
def test(use_cuda: bool):
    test_device_name = "GPU" if use_cuda else "CPU"

    test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')

    cfg = transformers.BertConfig()
    # use 4 threads for computing
    turbo_transformers.set_num_threads(4)

    input_ids = np.array(
        ([12166, 10699, 16752, 4454], [5342, 16471, 817, 16022]),
        dtype=np.int64)
    segment_ids = np.array(([1, 1, 1, 0], [1, 0, 0, 0]), dtype=np.int64)

    input_ids_tensor = turbo_transformers.nparray2tensor(
        input_ids, test_device_name)
    segment_ids_tensor = turbo_transformers.nparray2tensor(
        segment_ids, test_device_name)
    # 3. load model from npz
    if len(sys.argv) == 2:
        try:
            print(sys.argv[1])
            in_file = sys.argv[1]
        except Exception:
            sys.exit("ERROR: cannot open " + sys.argv[1])
    else:
        in_file = "/home/jiaruifang/codes/TurboTransformers/bert.npz"
    # 255 MiB

    tt_model = turbo_transformers.BertModel.from_npz(in_file, cfg, test_device)

    # 1169 MiB
    start_time = time.time()
    for _ in range(10):
        res = tt_model(input_ids_tensor,
                       token_type_ids=segment_ids_tensor,
                       return_type=turbo_transformers.ReturnType.NUMPY
                       )  # sequence_output, pooled_output
    end_time = time.time()

    print("turbo bert sequence output:", res[0][:, 0, :])
    print("turbo bert pooler output: ", res[1])  # pooled_output
    print("\nturbo time consum: {}".format(end_time - start_time))
Example No. 18
    def init_data(self, use_cuda: bool) -> None:
        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')
        if not use_cuda:
            torch.set_num_threads(4)
            turbo_transformers.set_num_threads(4)

        torch.set_grad_enabled(False)
        self.cfg = AlbertConfig(hidden_size=768,
                                num_attention_heads=12,
                                intermediate_size=3072)
        self.torch_model = AlbertModel(self.cfg)

        if torch.cuda.is_available():
            self.torch_model.to(self.test_device)
        self.torch_model.eval()
        self.hidden_size = self.cfg.hidden_size

        self.turbo_model = turbo_transformers.AlbertModel.from_torch(
            self.torch_model)
Example No. 19
def test(loadtype: LoadType, use_cuda: bool):
    cfg = transformers.AlbertConfig(hidden_size=768,
                                    num_attention_heads=12,
                                    intermediate_size=3072)
    model = transformers.AlbertModel(cfg)
    model.eval()
    torch.set_grad_enabled(False)

    test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')

    cfg = model.config
    # use 4 threads for computing
    turbo_transformers.set_num_threads(4)

    input_ids = torch.tensor(
        ([12166, 10699, 16752, 4454], [5342, 16471, 817, 16022]),
        dtype=torch.long,
        device=test_device)
    model.to(test_device)
    start_time = time.time()
    for _ in range(10):
        torch_res = model(input_ids)
    end_time = time.time()
    print("\ntorch time consum: {}".format(end_time - start_time))

    # there are three ways to load pretrained model.
    if loadtype is LoadType.PYTORCH:
        # 1. from a PyTorch model that has already loaded pretrained weights
        tt_model = turbo_transformers.AlbertModel.from_torch(model)
    else:
        raise ("LoadType is not supported")

    start_time = time.time()
    for _ in range(10):
        res = tt_model(input_ids)  # sequence_output, pooled_output
    end_time = time.time()

    print("\nturbo time consum: {}".format(end_time - start_time))
    assert (numpy.max(
        numpy.abs(res[0].cpu().numpy() - torch_res[0].cpu().numpy())) < 0.1)
Example No. 20
        def init_data(self, use_cuda: bool) -> None:
            self.test_device = torch.device('cuda:0') if use_cuda else \
                torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(4)
                turbo_transformers.set_num_threads(4)

            torch.set_grad_enabled(False)
            self.cfg = AlbertConfig()

            self.torch_model = AlbertModel(self.cfg)
            if torch.cuda.is_available():
                self.torch_model.to(self.test_device)
            self.torch_model.eval()
            self.hidden_size = self.cfg.hidden_size
            self.input_tensor = torch.randint(low=0,
                                              high=self.cfg.vocab_size - 1,
                                              size=(batch_size, seq_length),
                                              device=self.test_device)

            self.turbo_model = turbo_transformers.AlbertModel.from_torch(
                self.torch_model)
Example No. 21
def benchmark_turbo_transformers(model: str, seq_len: int, batch_size: int,
                                 n: int, num_threads: int):
    import torch
    import transformers
    import contexttimer
    import turbo_transformers
    import cProfile
    import benchmark_helper
    turbo_transformers.set_num_threads(num_threads)

    model_id = "bert-base-uncased"
    model = transformers.BertModel.from_pretrained(
        model_id)  # type: transformers.BertModel
    model.eval()

    cfg = model.config  # type: transformers.BertConfig
    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(batch_size, seq_len),
                              dtype=torch.long)
    model = turbo_transformers.BertModel.from_torch(model)
    benchmark_helper.run_model(lambda: model(input_ids), False, n, batch_size,
                               seq_len, "turbo", num_threads)
Example No. 22
def test(use_cuda):
    torch.set_grad_enabled(False)
    torch.set_num_threads(4)
    turbo_transformers.set_num_threads(4)

    test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')

    cfg = RobertaConfig()
    torch_model = RobertaModel(cfg)
    torch_model.eval()

    if torch.cuda.is_available():
        torch_model.to(test_device)

    turbo_model = turbo_transformers.RobertaModel.from_torch(
        torch_model, test_device)

    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(1, 10),
                              dtype=torch.long,
                              device=test_device)

    torch_result = torch_model(input_ids)
    torch_result_final = torch_result[0][:, 0].cpu().numpy()

    turbo_result = turbo_model(input_ids)
    turbo_result_final = turbo_result[0].cpu().numpy()

    # See the differences
    # print(numpy.size(torch_result_final), numpy.size(turbo_result_final))
    # print(torch_result_final - turbo_result_final)
    assert (numpy.allclose(torch_result_final,
                           turbo_result_final,
                           atol=1e-3,
                           rtol=1e-3))
        def init_data(self, use_cuda):
            self.test_device = torch.device('cuda:0') if use_cuda else \
                   torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(4)
                turbo_transformers.set_num_threads(4)

            torch.set_grad_enabled(False)
            self.model_dim = 1024
            self.onmt_encoder = TransformerEncoderLayer(d_model=self.model_dim,
                                                        heads=8,
                                                        d_ff=1024,
                                                        dropout=0.,
                                                        attention_dropout=0.)
            self.onmt_encoder.eval()
            if use_cuda:
                self.onmt_encoder.to(self.test_device)
            self.turbo_encoder = turbo_transformers.TransformerEncoderLayer.from_onmt(
                self.onmt_encoder)

            # https://pytorch.org/docs/stable/quantization.html
            if with_quantize_dynamic and not use_cuda:
                self.quantized_onmt_encoder = torch.quantization.quantize_dynamic(
                    self.onmt_encoder)
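            # quantize_dynamic with its default arguments replaces supported modules
            # (notably nn.Linear) inside the encoder with dynamically quantized int8
            # versions. Dynamic quantization is a CPU-only path, which is why the
            # branch above is guarded by `not use_cuda`.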
Example No. 24
    def __init__(self,
                 model,
                 fields,
                 src_reader,
                 tgt_reader,
                 gpu=-1,
                 n_best=1,
                 min_length=0,
                 max_length=100,
                 ratio=0.,
                 beam_size=30,
                 random_sampling_topk=1,
                 random_sampling_temp=1,
                 stepwise_penalty=None,
                 dump_beam=False,
                 block_ngram_repeat=0,
                 ignore_when_blocking=frozenset(),
                 replace_unk=False,
                 phrase_table="",
                 data_type="text",
                 verbose=False,
                 report_time=False,
                 copy_attn=False,
                 global_scorer=None,
                 out_file=None,
                 report_align=False,
                 report_score=True,
                 logger=None,
                 seed=-1):
        self.model = model
        # build a turbo decoder from the onmt decoder
        turbo_transformers.set_num_threads(4)
        self.turbo_decoder = turbo_transformers.TransformerDecoder.from_onmt(
            self.model.decoder)

        self.fields = fields
        tgt_field = dict(self.fields)["tgt"].base_field
        self._tgt_vocab = tgt_field.vocab
        self._tgt_eos_idx = self._tgt_vocab.stoi[tgt_field.eos_token]
        self._tgt_pad_idx = self._tgt_vocab.stoi[tgt_field.pad_token]
        self._tgt_bos_idx = self._tgt_vocab.stoi[tgt_field.init_token]
        self._tgt_unk_idx = self._tgt_vocab.stoi[tgt_field.unk_token]
        self._tgt_vocab_len = len(self._tgt_vocab)

        self._gpu = gpu
        self._use_cuda = gpu > -1
        self._dev = torch.device("cuda", self._gpu) \
            if self._use_cuda else torch.device("cpu")

        self.n_best = n_best
        self.max_length = max_length

        self.beam_size = beam_size
        self.random_sampling_temp = random_sampling_temp
        self.sample_from_topk = random_sampling_topk

        self.min_length = min_length
        self.ratio = ratio
        self.stepwise_penalty = stepwise_penalty
        self.dump_beam = dump_beam
        self.block_ngram_repeat = block_ngram_repeat
        self.ignore_when_blocking = ignore_when_blocking
        self._exclusion_idxs = {
            self._tgt_vocab.stoi[t]
            for t in self.ignore_when_blocking
        }
        self.src_reader = src_reader
        self.tgt_reader = tgt_reader
        self.replace_unk = replace_unk
        if self.replace_unk and not self.model.decoder.attentional:
            raise ValueError("replace_unk requires an attentional decoder.")
        self.phrase_table = phrase_table
        self.data_type = data_type
        self.verbose = verbose
        self.report_time = report_time

        self.copy_attn = copy_attn

        self.global_scorer = global_scorer
        if self.global_scorer.has_cov_pen and \
                not self.model.decoder.attentional:
            raise ValueError(
                "Coverage penalty requires an attentional decoder.")
        self.out_file = out_file
        self.report_align = report_align
        self.report_score = report_score
        self.logger = logger

        self.use_filter_pred = False
        self._filter_pred = None

        # for debugging
        self.beam_trace = self.dump_beam != ""
        self.beam_accum = None
        if self.beam_trace:
            self.beam_accum = {
                "predicted_ids": [],
                "beam_parent_ids": [],
                "scores": [],
                "log_probs": []
            }

        set_random_seed(seed, self._use_cuda)
Example No. 25
def test(loadtype: LoadType, use_cuda: bool):
    test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    model_id = "bert-base-uncased"
    model = transformers.BertModel.from_pretrained(model_id)
    model.eval()
    model.to(test_device)
    torch.set_grad_enabled(False)

    cfg = model.config
    # use 4 threads for computing
    turbo_transformers.set_num_threads(4)

    input_ids = torch.tensor(
        ([12166, 10699, 16752, 4454], [5342, 16471, 817, 16022]),
        dtype=torch.long,
        device=test_device)
    # position_ids = torch.tensor(([1, 0, 0, 0], [1, 1, 1, 0]), dtype=torch.long, device = test_device)
    segment_ids = torch.tensor(([1, 1, 1, 0], [1, 0, 0, 0]),
                               dtype=torch.long,
                               device=test_device)

    start_time = time.time()
    for _ in range(10):
        torch_res = model(
            input_ids, token_type_ids=segment_ids
        )  # sequence_output, pooled_output, (hidden_states), (attentions)
    end_time = time.time()
    print("\ntorch time consum: {}".format(end_time - start_time))
    print("torch bert sequence output: ",
          torch_res[0][:, 0, :])  #get the first sequence
    print("torch bert pooler output: ", torch_res[1])  # pooled_output

    # there are three ways to load a pretrained model.
    if loadtype is LoadType.PYTORCH:
        # 1. from a PyTorch model that has already loaded pretrained weights
        # note that you can choose "turbo" or "onnxrt" as the backend;
        # "turbo" is a hand-crafted implementation optimized with OpenMP.
        tt_model = turbo_transformers.BertModel.from_torch(
            model, test_device, "turbo")
    elif loadtype is LoadType.PRETRAINED:
        # 2. directly load from checkpoint (torch saved model)
        tt_model = turbo_transformers.BertModel.from_pretrained(
            model_id, test_device)
    elif loadtype is LoadType.NPZ:
        # 3. load model from npz
        if len(sys.argv) == 2:
            try:
                print(sys.argv[1])
                in_file = sys.argv[1]
            except Exception:
                sys.exit("ERROR: cannot open " + sys.argv[1])
        else:
            in_file = "/workspace/bert_torch.npz"
        tt_model = turbo_transformers.BertModel.from_npz(
            in_file, cfg, test_device)
    else:
        raise ("LoadType is not supported")

    start_time = time.time()
    for _ in range(10):
        res = tt_model(
            input_ids,
            token_type_ids=segment_ids)  # sequence_output, pooled_output
    end_time = time.time()

    print("turbo bert sequence output:", res[0][:, 0, :])
    print("turbo bert pooler output: ", res[1])  # pooled_output
    print("\nturbo time consum: {}".format(end_time - start_time))
    assert (torch.max(torch.abs(res[0] - torch_res[0])) < 0.2)
        return BertForSequenceClassification(bertmodel, model.classifier)

    @staticmethod
    def from_pretrained(model_id_or_path: str,
                        device: Optional[torch.device] = None):
        # First, use from_pretrained to load the fine-tuned torch model.
        torch_model = TorchBertForSequenceClassification.from_pretrained(
            model_id_or_path)
        # Then build the accelerated model from the torch model.
        model = BertForSequenceClassification.from_torch(torch_model, device)
        model._torch_model = torch_model  # keep a reference so the torch model is not destroyed.
        return model


# use 4 threads for BERT inference
turbo_transformers.set_num_threads(4)

model_id = os.path.join(
    os.path.dirname(__file__),
    'test-seq-classification-model')  # path to the fine-tuned huggingface model
tokenizer = BertTokenizer.from_pretrained(
    model_id)  # initialize the tokenizer
turbo_model = BertForSequenceClassification.from_pretrained(
    model_id,
    torch.device('cpu:0'))  # initialize the accelerated model

# predict after loading the model
# the example sentence is Chinese: "Test whether the performance and accuracy of the bert model meet the requirements?"
input_ids = torch.tensor(
    tokenizer.encode('测试一下bert模型的性能和精度是不是符合要求?',
                     add_special_tokens=True)).unsqueeze(0)
torch_result = turbo_model(input_ids)
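# A hypothetical follow-up: turn the classifier output into a predicted class id.
# Whether the result is a bare logits tensor or a tuple whose first element holds
# the logits is an assumption here.
logits = torch_result[0] if isinstance(torch_result, (tuple, list)) else torch_result
print("predicted label:", torch.argmax(logits, dim=-1).item())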