Example #1
    def test_infer_dynamic_axis_pytorch(self):
        """
        Validate that the dynamic axes generated for each parameter are correct
        """
        from transformers import BertModel

        model = BertModel(BertConfig.from_pretrained("lysandre/tiny-bert-random"))
        tokenizer = BertTokenizerFast.from_pretrained("lysandre/tiny-bert-random")
        self._test_infer_dynamic_axis(model, tokenizer, "pt")
Example #2
    def __init__(self, file_path, max_len=128):
        logging.info('[+] Init Data.')
        self.data = pd.read_csv(file_path)
        logging.info('[+] Load Data: Done.')
        self.tokenizer = BertTokenizerFast.from_pretrained("kykim/gpt3-kor-small_based_on_gpt2")

        self.max_len = max_len
        self.first = True
Example #3
    def test_infer_dynamic_axis_tf(self):
        """
        Validate that the dynamic axes generated for each parameter are correct
        """
        from transformers import TFBertModel

        model = TFBertModel(BertConfig.from_pretrained("bert-base-cased"))
        tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
        self._test_infer_dynamic_axis(model, tokenizer, "tf")
Example #4
 def __init__(self,
              encoder: str = None,
              device: str = 'cpu',
              prefix: str = "question:"):
     self.device = device
     self.model = BertModel.from_pretrained(encoder)
     self.model.to(self.device)
     self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
     self.prefix = prefix
Example #5
def getBertTokenizer(model):
    if model == 'bert-base-uncased':
        tokenizer = BertTokenizerFast.from_pretrained(model)
    elif model == 'distilbert-base-uncased':
        tokenizer = DistilBertTokenizerFast.from_pretrained(model)
    else:
        raise ValueError(f'Model: {model} not recognized.')

    return tokenizer
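A quick illustrative call of the helper above (the DistilBERT branch behaves the same way):

tokenizer = getBertTokenizer('bert-base-uncased')
print(tokenizer.is_fast)  # True: the Rust-backed fast tokenizer is returned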
Example #6
    def test_batch_encoding_is_fast(self):
        tokenizer_p = BertTokenizer.from_pretrained("bert-base-cased")
        tokenizer_r = BertTokenizerFast.from_pretrained("bert-base-cased")

        with self.subTest("Python Tokenizer"):
            self.assertFalse(tokenizer_p("Small example to_encode").is_fast)

        with self.subTest("Rust Tokenizer"):
            self.assertTrue(tokenizer_r("Small example to_encode").is_fast)
Example #7
 def __init__(self, num_features):
     super().__init__()
     bert_model_path = os.path.join(os.getcwd(), 'app/models/binaries', 'bert_saved_model')
     model = BertForSequenceClassification.from_pretrained(bert_model_path)
     tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', model_max_length=256)

     self.explainer = LimeTextExplainer(class_names=['Negative', 'Positive'])
     self.predict = Predict(model, tokenizer)
     self.num_features = num_features
Example #8
    def __init__(self, hparams: Union[Dict, Namespace]):
        # NOTE: internal code may pass hparams as dict **kwargs
        if isinstance(hparams, dict):
            hparams = Namespace(**hparams)

        self.label_ids_to_label = LabelTokenAligner.get_ids_to_label(hparams.labels)
        num_labels = len(self.label_ids_to_label)

        super().__init__()
        # Enables access to arguments via self.hparams
        self.save_hyperparameters(hparams)

        self.step_count = 0
        self.output_dir = Path(self.hparams.output_dir)
        self.cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None
        if self.cache_dir is not None and not os.path.exists(self.hparams.cache_dir):
            os.mkdir(self.cache_dir)

        # AutoTokenizer
        # trf>=4.0.0: PreTrainedTokenizerFast by default
        # NOTE: AutoTokenizer doesn't load PreTrainedTokenizerFast...
        self.tokenizer_name = self.hparams.model_name_or_path
        self.tokenizer = BertTokenizerFast.from_pretrained(
            self.tokenizer_name,
            cache_dir=self.cache_dir,
            # tokenize_chinese_chars=False,  # Need to pretrain tokenizer
            # strip_accents=False,
        )

        # AutoConfig
        config_name = self.hparams.model_name_or_path
        self.config: PretrainedConfig = BertConfig.from_pretrained(
            config_name,
            **({"num_labels": num_labels} if num_labels is not None else {}),
            cache_dir=self.cache_dir,
        )
        extra_model_params = (
            "encoder_layerdrop",
            "decoder_layerdrop",
            "dropout",
            "attention_dropout",
        )
        for p in extra_model_params:
            if getattr(self.hparams, p, None) and hasattr(self.config, p):
                setattr(self.config, p, getattr(self.hparams, p, None))

        # AutoModelForTokenClassification
        self.model: PreTrainedModel = BertForTokenClassification.from_pretrained(
            self.hparams.model_name_or_path,
            from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
            config=self.config,
            cache_dir=self.cache_dir,
        )
        if self.hparams.freeze_pretrained:
            for name, param in self.model.named_parameters():
                if "classifier" not in name:
                    param.requires_grad = False
Example #9
 def __init__(self, hparams, **kwargs):
     super().__init__()
     self.hparams = hparams
     self.kogpt3 = GPT2LMHeadModel.from_pretrained(
         "kykim/gpt3-kor-small_based_on_gpt2")
     self.loss_function = nn.CrossEntropyLoss(reduction='none')
     self.neg = -1e18
     self.tokenizer = BertTokenizerFast.from_pretrained(
         "kykim/gpt3-kor-small_based_on_gpt2")
Example #10
    def __init__(self, config: Union[Dict[str, Dict[str, Any]], str]):
        """
        Create wrapper with configuration.

        :param config: config in dictionary format or path to config file (.yml)
        """
        if isinstance(config, str):
            self.config_file_path = config
            try:
                with open(config, "r") as f:
                    config = yaml.safe_load(f)
            except Exception as ex:
                raise RuntimeError(
                    f"Cannot read config file from {config}: {ex}")

        self.config = config
        self.util_config = config.get("util", None)

        model_config_dict = config.get("model", None)
        if not model_config_dict:
            raise ValueError(f"Config file should have 'model' attribute")

        self.dataset_config = model_config_dict

        if model_config_dict["device"] is not None:
            self.device = torch.device(
                model_config_dict["device"]) if torch.cuda.is_available(
                ) else torch.device("cpu")

        model_config_attributes = ["model", "intents", "entities"]
        # model_config_dict = {k: v for k, v in model_config_dict.items() if k in model_config_attributes}

        self.intents = model_config_dict["intents"]
        self.entities = ["O"] + model_config_dict["entities"]

        self.model_config = DIETClassifierConfig(
            **{
                k: v
                for k, v in model_config_dict.items()
                if k in model_config_attributes
            })

        training_config_dict = config.get("training", None)
        if not training_config_dict:
            raise ValueError(f"Config file should have 'training' attribute")

        self.training_config = training_config_dict
        self.tokenizer = BertTokenizerFast.from_pretrained(
            model_config_dict["tokenizer"])
        self.model = DIETClassifier(config=self.model_config)

        self.model.to(self.device)

        self.softmax = torch.nn.Softmax(dim=-1)

        self.synonym_dict = model_config_dict.get("synonym") or {}
Example #11
def inference(onnx_model, model_dir, examples, fast_tokenizer, num_threads):
    quantized_str = ''
    if 'quantized' in onnx_model:
        quantized_str = 'quantized'
    onnx_inference = []
#     pytorch_inference = []
    # onnx session
    options = ort.SessionOptions()
    options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
    options.intra_op_num_threads = 1
    print(onnx_model)
    ort_session = ort.InferenceSession(onnx_model, options)

    # pytorch pretrained model and tokenizer
    tokenizer = BertTokenizerFast.from_pretrained(model_dir)
    tokenizer_str = "BertTokenizerFast"

    print("**************** {} ONNX inference with batch tokenization and with {} tokenizer****************".format(quantized_str, tokenizer_str))
    start_onnx_inference_batch = time.time()
    start_batch_tokenization = time.time()
    tokens_dict = tokenizer.batch_encode_plus(examples, max_length=128, truncation=True)
    total_batch_tokenization_time = time.time() - start_batch_tokenization
    total_inference_time = 0
    total_build_label_time = 0
    for i in range(len(examples)):
        """
        Onnx inference with batch tokenization
        """
        
        if i % 100 == 0:
            print('[inference... ]', i, 'out of', len(examples))

        tokens = get_tokens(tokens_dict, i)
        #inference
        start_inference = time.time()
        ort_outs = ort_session.run(None, tokens)
        total_inference_time += time.time() - start_inference
        #build label
        start_build_label = time.time()
        torch_onnx_output = torch.tensor(ort_outs[0], dtype=torch.float32)
        onnx_logits = F.softmax(torch_onnx_output, dim=1)
        logits_label = torch.argmax(onnx_logits, dim=1)
        label = logits_label.detach().cpu().numpy()
#         onnx_inference.append(label[0])
        onnx_inference.append(onnx_logits.detach().cpu().numpy()[0].tolist())
        total_build_label_time += time.time() - start_build_label
#         print(i, label[0], onnx_logits.detach().cpu().numpy()[0].tolist(), type(onnx_logits.detach().cpu().numpy()[0]) )

    end_onnx_inference_batch = time.time()
    print("Total batch tokenization time (in seconds): ", total_batch_tokenization_time)
    print("Total inference time (in seconds): ", total_inference_time)
    print("Total build label time (in seconds): ", total_build_label_time)
    print("Duration ONNX inference (in seconds) with {} and batch tokenization: ".format(tokenizer_str), end_onnx_inference_batch - start_onnx_inference_batch, (end_onnx_inference_batch - start_onnx_inference_batch)/len(examples))

    return onnx_inference
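The helper get_tokens is not defined in this snippet. A plausible sketch, assuming the ONNX graph was exported with the standard BERT input names:

import numpy as np

def get_tokens(tokens_dict, i):
    # Slice example i out of the batch encoding and add a batch dimension;
    # onnxruntime expects int64 arrays keyed by the exported input names.
    return {name: np.array([tokens_dict[name][i]], dtype=np.int64)
            for name in ("input_ids", "token_type_ids", "attention_mask")}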
Example #12
    def __init__(self, doc_maxlen):
        self.tok = BertTokenizerFast.from_pretrained('bert-base-uncased')
        self.doc_maxlen = doc_maxlen

        self.D_marker_token, self.D_marker_token_id = '[D]', self.tok.convert_tokens_to_ids(
            '[unused1]')
        self.cls_token, self.cls_token_id = self.tok.cls_token, self.tok.cls_token_id
        self.sep_token, self.sep_token_id = self.tok.sep_token, self.tok.sep_token_id

        assert self.D_marker_token_id == 2
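For context, a minimal sketch of how such a marker id is typically spliced in right after [CLS]; the class name DocTokenizer is an assumption, since only __init__ appears above:

doc_tok = DocTokenizer(doc_maxlen=180)               # hypothetical instantiation
ids = doc_tok.tok("some passage text")["input_ids"]  # [CLS] ... [SEP]
ids = [doc_tok.cls_token_id, doc_tok.D_marker_token_id] + ids[1:]  # [CLS] [D] ...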
Example #13
    def __init__(self):
        super(LanguageInput, self).__init__()

        self.berttokenizer = BertTokenizerFast.from_pretrained(
            'bert-base-uncased')
        self.bertmodel = BertModel.from_pretrained('bert-base-uncased')

        # Do not fine-tune BERT: freeze all of its parameters
        for p in self.bertmodel.parameters():
            p.requires_grad = False
Example #14
 def __init__(self):
     print("Tagger initializing...", flush=True)
     model_name = 'bert-base-uncased'
     config = BertConfig.from_pretrained(model_name)
     config.output_hidden_states = False
     self.tokenizer = BertTokenizerFast.from_pretrained(
         pretrained_model_name_or_path=model_name, config=config)
     self.model = BertForSequenceClassification.from_pretrained(
         "model_save")
     print("Tagger initialized...", flush=True)
Example #15
def shard_and_map(index, filename, num_shards, function, **kwargs):
    print(f"Sharding on process {index}")
    shard = nlp.Dataset.from_file(filename).shard(
        num_shards,
        index,
        contiguous=True,
        load_from_cache_file=load_from_cache_file)
    print(f"Done sharding on process {index}. Mapping the shard")
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    return shard.map(partial(function, tokenizer=tokenizer), **kwargs)
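Note that shard_and_map leans on a module-level load_from_cache_file flag. A hedged usage sketch with a process pool; the file name and mapping function are illustrative:

from multiprocessing import Pool

load_from_cache_file = True   # module-level flag the function expects

def tokenize_example(example, tokenizer=None):
    return tokenizer(example["text"], truncation=True, max_length=128)

if __name__ == "__main__":
    num_shards = 4
    with Pool(num_shards) as pool:
        shards = pool.starmap(
            shard_and_map,
            [(i, "train.arrow", num_shards, tokenize_example)
             for i in range(num_shards)])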
Example #16
    def __init__(self, args):
        self.args = args
        self.parse = Parser()
        self._token_indexers = {"tokens": SingleIdTokenIndexer(), "token_characters": TokenCharactersIndexer()}
        if args.model_type == "elmo":
            self._token_indexers["elmo_characters"] = ELMoIndexer()

        self.bert_tokenizer = None
        if args.model_type == "bert":
            self.bert_tokenizer = BertTokenizerFast.from_pretrained(args.pretrained_model_dir)
Example #17
    def __init__(self, query_maxlen):
        self.tok = BertTokenizerFast.from_pretrained(
            'bert-base-multilingual-cased')
        self.query_maxlen = query_maxlen

        self.Q_marker_token, self.Q_marker_token_id = '[Q]', self.tok.get_vocab(
        )['[unused21]']
        self.cls_token, self.cls_token_id = self.tok.cls_token, self.tok.cls_token_id
        self.sep_token, self.sep_token_id = self.tok.sep_token, self.tok.sep_token_id
        self.mask_token, self.mask_token_id = self.tok.mask_token, self.tok.mask_token_id
Example #18
    def __init__(self):
        self.tok = BertTokenizerFast.from_pretrained(
            "bert-base-multilingual-cased")

        self.D_marker_token, self.D_marker_token_id = "[D]", self.tok.convert_tokens_to_ids(
            "[unused1]")
        self.cls_token, self.cls_token_id = self.tok.cls_token, self.tok.cls_token_id
        self.sep_token, self.sep_token_id = self.tok.sep_token, self.tok.sep_token_id

        assert self.D_marker_token_id == 1
Example #19
 def __init__(self, config: Bunch) -> None:
     super().__init__()
     self.config = config
     self.model = BertForSequenceClassification.from_pretrained(
         config.pretrained_model)
     bert_tokenizer = BertTokenizerFast.from_pretrained(
         self.config.pretrained_model)
     tokenizer = PreTrainedTokenizer(bert_tokenizer,
                                     self.config.max_tokens_per_tweet)
     self.data_processor = DataProcessor(config, tokenizer)
     self.loss = CrossEntropyLoss()
Example #20
def get_tokenizer(vocab_size):
    pretrained_tokenizer_path = Path(
        'experiments/tokenizers') / f'{tokenizer_type}-{vocab_size}'
    logger.info(
        f'loading {tokenizer_type}-{vocab_size} tokenizer from {pretrained_tokenizer_path}'
    )
    if transformer_type == 'roberta':
        return RobertaTokenizerFast.from_pretrained(
            str(pretrained_tokenizer_path), max_len=512)
    return BertTokenizerFast.from_pretrained(str(pretrained_tokenizer_path),
                                             max_len=512)
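This function also relies on module-level tokenizer_type, transformer_type, and logger bindings. Illustrative values for a standalone run; the path must already contain a trained tokenizer:

import logging
from pathlib import Path   # used inside get_tokenizer

logger = logging.getLogger(__name__)
tokenizer_type = 'wordpiece'    # assumed value
transformer_type = 'bert'       # anything but 'roberta' takes the BERT branch
tokenizer = get_tokenizer(vocab_size=30000)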
Example #21
    def __init__(self,
        model_name: str,
        tokenizer_name: Optional[str] = None,
        *,
        device: int = -1,
    ):
        self.model = AutoModelForTokenClassification.from_pretrained(model_name)
        self.tokenizer = BertTokenizerFast.from_pretrained(tokenizer_name or model_name)

        self.device = torch.device('cpu' if device < 0 else f'cuda:{device}') # pylint: disable=no-member
        self.model.to(self.device)
Example #22
    def __init__(self, dataset: SquadDataset, bert_type: str, lazy=False, return_sample=False, eval=False):
        self.dataset = dataset
        self.tokenizer = BertTokenizerFast.from_pretrained(bert_type)
        self.return_sample = return_sample
        self.eval = eval

        if not lazy:
            # Temporarily flip on lazy mode so self[index] builds each sample,
            # then materialize the whole dataset eagerly.
            self.lazy = True
            self.dataset = [self[index] for index in trange(0, len(self.dataset), desc="Preprocessing")]

        self.lazy = lazy
Example #23
    def __init__(self, data, labels):
        super().__init__()
        self.data = data
        # Grab the tokenizer
        self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
        self.labels = labels

        # Pad the labels: sequences shorter than 512 are right-padded with zeros
        for label in labels:
            label.extend([0] * (512 - len(label)))
Example #24
    def __init__(self, query_maxlen):
        self.tok = BertTokenizerFast.from_pretrained('bert-base-uncased')
        self.query_maxlen = query_maxlen

        self.Q_marker_token, self.Q_marker_token_id = '[Q]', self.tok.get_vocab(
        )['[unused0]']
        self.cls_token, self.cls_token_id = self.tok.cls_token, self.tok.cls_token_id
        self.sep_token, self.sep_token_id = self.tok.sep_token, self.tok.sep_token_id
        self.mask_token, self.mask_token_id = self.tok.mask_token, self.tok.mask_token_id

        assert self.Q_marker_token_id == 1 and self.mask_token_id == 103
Example #25
    def __init__(self, vocabs: Dict[str, Vocabulary], subword: str,
                 use_pos_tag: bool, bert_path: str, transliterate: str,
                 d_model: int, partition: bool, pos_tag_emb_dropout: float,
                 position_emb_dropout: float, bert_emb_dropout: float,
                 emb_dropout: float, language: str, device: torch.device):
        super(EmbeddingLayer, self).__init__()

        self.BERT = BertModel.from_pretrained(bert_path)
        self.tokenizer = BertTokenizerFast.from_pretrained(bert_path)
        self.bert_hidden_size = self.BERT.config.hidden_size
        if subword == 'max_pool':
            self.pool = nn.AdaptiveMaxPool1d(1)
        elif subword == 'avg_pool':
            self.pool = nn.AdaptiveAvgPool1d(1)
        else:
            self.pool = None

        self.position_emb_dropout = nn.Dropout(position_emb_dropout)
        self.bert_emb_dropout = nn.Dropout(bert_emb_dropout)
        self.emb_dropout = nn.Dropout(emb_dropout)
        self.layer_norm = nn.LayerNorm(d_model, elementwise_affine=True)

        self.partition = partition
        self.d_model = d_model
        self.d_content = d_model // 2 if partition else d_model
        self.d_position = d_model - d_model // 2 if partition else d_model
        self.bert_proj = nn.Linear(self.bert_hidden_size, self.d_content)
        self.pos_tag_embeddings = None
        self.pos_tag_emb_dropout = None
        self.use_pos_tag = use_pos_tag
        if use_pos_tag:
            self.pos_tag_emb_dropout = nn.Dropout(pos_tag_emb_dropout)
            self.pos_tag_embeddings = nn.Embedding(
                vocabs['pos_tags'].size + 1,
                self.d_content,
                padding_idx=vocabs['pos_tags'].size)

        self.position_embeddings = LearnedPositionalEmbedding(self.d_position,
                                                              max_len=512)

        if transliterate == '':
            self.bert_transliterate = None
        else:
            assert transliterate in TRANSLITERATIONS
            self.bert_transliterate = TRANSLITERATIONS[transliterate]
        self.subword = subword
        self.language = language
        if self.subword == CHARACTER_BASED and self.language == 'chinese':
            assert not self.use_pos_tag

        self.pos_tags_vocab = vocabs['pos_tags']
        self.labels_vocab = vocabs['labels']
        self.device = device
Example #26
    def __init__(self):
        self.tok = BertTokenizerFast.from_pretrained(
            "bert-base-multilingual-cased")

        self.Q_marker_token, self.Q_marker_token_id = "[Q]", self.tok.convert_tokens_to_ids(
            "[unused0]")
        self.cls_token, self.cls_token_id = self.tok.cls_token, self.tok.cls_token_id
        self.sep_token, self.sep_token_id = self.tok.sep_token, self.tok.sep_token_id
        self.mask_token, self.mask_token_id = self.tok.mask_token, self.tok.mask_token_id
        self.query_maxlen = self.tok.model_max_length

        assert self.Q_marker_token_id == 100 and self.mask_token_id == 103
Example #27
    def __init__(self, query_maxlen):
        self.tok = BertTokenizerFast.from_pretrained('bert-base-uncased')
        self.query_maxlen = query_maxlen

        self.Q_marker_token, self.Q_marker_token_id = '[Q]', self.tok.convert_tokens_to_ids(
            '[unused0]')
        self.cls_token, self.cls_token_id = self.tok.cls_token, self.tok.cls_token_id
        self.sep_token, self.sep_token_id = self.tok.sep_token, self.tok.sep_token_id
        self.mask_token, self.mask_token_id = self.tok.mask_token, self.tok.mask_token_id
        self.nltk_stopwords = nltk.corpus.stopwords.words("english")

        assert self.Q_marker_token_id == 1 and self.mask_token_id == 103
Example #28
async def main(message: types.Message):
    from transformers import pipeline, BertTokenizerFast, AutoModelWithLMHead
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
    model = AutoModelWithLMHead.from_pretrained('ckiplab/gpt2-base-chinese')
    text_generation = pipeline("text-generation",
                               model=model,
                               tokenizer=tokenizer)
    text_input = "机器学习是"
    generated_text = text_generation(text_input,
                                     max_length=50,
                                     do_sample=False)[0]
    await message.reply(text=generated_text)
Example #29
def get_tokenizer() -> BertTokenizerFast:
    """
    Get the trained BERT tokenizer.
    """
    try:
        return BertTokenizerFast.from_pretrained(
            os.path.join(this_dir, 'data/models/fine-tuned-paragraph-classifier'),
            do_lower_case=False
        )
    except OSError:
        raise OSError('Failed to load model. Did you download the models by '
                      '`python -m synthesis_classifier.model download`?')
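An illustrative call; the directory under data/models must already contain the downloaded tokenizer files:

tokenizer = get_tokenizer()
enc = tokenizer("TiO2 powders were annealed at 500 °C.", truncation=True)
print(len(enc.input_ids))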
Example #30
    def __init__(self, query_maxlen):
        self.tok = BertTokenizerFast.from_pretrained(
            'bert-base-multilingual-uncased')
        self.query_maxlen = query_maxlen

        self.Q_marker_token, self.Q_marker_token_id = '[Q]', self.tok.convert_tokens_to_ids(
            '[unused0]')
        self.cls_token, self.cls_token_id = self.tok.cls_token, self.tok.cls_token_id
        self.sep_token, self.sep_token_id = self.tok.sep_token, self.tok.sep_token_id
        self.mask_token, self.mask_token_id = self.tok.mask_token, self.tok.mask_token_id

        assert self.Q_marker_token_id == 100 and self.mask_token_id == 103