def test_infer_dynamic_axis_pytorch(self):
    """Validate that the dynamic axes generated for each parameter are correct."""
    from transformers import BertModel

    model = BertModel(BertConfig.from_pretrained("lysandre/tiny-bert-random"))
    tokenizer = BertTokenizerFast.from_pretrained("lysandre/tiny-bert-random")
    self._test_infer_dynamic_axis(model, tokenizer, "pt")
def __init__(self, file_path, max_len=128):
    logging.info('[+] Init Data.')
    self.data = pd.read_csv(file_path)
    logging.info('[+] Load Data: Done.')
    self.tokenizer = BertTokenizerFast.from_pretrained("kykim/gpt3-kor-small_based_on_gpt2")
    self.max_len = max_len
    self.first = True
def test_infer_dynamic_axis_tf(self):
    """Validate that the dynamic axes generated for each parameter are correct."""
    from transformers import TFBertModel

    model = TFBertModel(BertConfig.from_pretrained("bert-base-cased"))
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
    self._test_infer_dynamic_axis(model, tokenizer, "tf")
def __init__(self, encoder: str = None, device: str = 'cpu', prefix: str = "question:"):
    self.device = device
    self.model = BertModel.from_pretrained(encoder)
    self.model.to(self.device)
    self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    self.prefix = prefix
def getBertTokenizer(model):
    if model == 'bert-base-uncased':
        tokenizer = BertTokenizerFast.from_pretrained(model)
    elif model == 'distilbert-base-uncased':
        tokenizer = DistilBertTokenizerFast.from_pretrained(model)
    else:
        raise ValueError(f'Model: {model} not recognized.')
    return tokenizer
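# Usage sketch for getBertTokenizer (the sentence and encoding flags below are
# illustrative assumptions, not from the source):
tokenizer = getBertTokenizer('distilbert-base-uncased')
batch = tokenizer(['a short example'], padding=True, truncation=True, return_tensors='pt')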
def test_batch_encoding_is_fast(self):
    tokenizer_p = BertTokenizer.from_pretrained("bert-base-cased")
    tokenizer_r = BertTokenizerFast.from_pretrained("bert-base-cased")

    with self.subTest("Python Tokenizer"):
        self.assertFalse(tokenizer_p("Small example to_encode").is_fast)

    with self.subTest("Rust Tokenizer"):
        self.assertTrue(tokenizer_r("Small example to_encode").is_fast)
def __init__(self, num_features):
    super().__init__()
    bert_model_path = os.getcwd() + '/app/models/binaries' + '/bert_saved_model/'
    model = BertForSequenceClassification.from_pretrained(bert_model_path)
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', model_max_length=256)
    self.explainer = LimeTextExplainer(class_names=['Negative', 'Positive'])
    self.predict = Predict(model, tokenizer)
    self.num_features = num_features
def __init__(self, hparams: Union[Dict, Namespace]):
    # NOTE: internal code may pass hparams as dict **kwargs
    if isinstance(hparams, Dict):
        hparams = Namespace(**hparams)

    self.label_ids_to_label = LabelTokenAligner.get_ids_to_label(hparams.labels)
    num_labels = len(self.label_ids_to_label)

    super().__init__()
    # Enable access to arguments via self.hparams
    self.save_hyperparameters(hparams)

    self.step_count = 0
    self.output_dir = Path(self.hparams.output_dir)
    self.cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None
    if self.cache_dir is not None and not os.path.exists(self.hparams.cache_dir):
        os.mkdir(self.cache_dir)

    # AutoTokenizer
    # trf>=4.0.0: PreTrainedTokenizerFast by default
    # NOTE: AutoTokenizer doesn't load PreTrainedTokenizerFast...
    self.tokenizer_name = self.hparams.model_name_or_path
    self.tokenizer = BertTokenizerFast.from_pretrained(
        self.tokenizer_name,
        cache_dir=self.cache_dir,
        # tokenize_chinese_chars=False,  # Need to pretrain tokenizer
        # strip_accents=False,
    )

    # AutoConfig
    config_name = self.hparams.model_name_or_path
    self.config: PretrainedConfig = BertConfig.from_pretrained(
        config_name,
        **({"num_labels": num_labels} if num_labels is not None else {}),
        cache_dir=self.cache_dir,
    )
    extra_model_params = (
        "encoder_layerdrop",
        "decoder_layerdrop",
        "dropout",
        "attention_dropout",
    )
    for p in extra_model_params:
        if getattr(self.hparams, p, None) and hasattr(self.config, p):
            setattr(self.config, p, getattr(self.hparams, p, None))

    # AutoModelForTokenClassification
    self.model: PreTrainedModel = BertForTokenClassification.from_pretrained(
        self.hparams.model_name_or_path,
        from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
        config=self.config,
        cache_dir=self.cache_dir,
    )
    if self.hparams.freeze_pretrained:
        # Freeze the pretrained encoder; train only the classification head.
        for name, param in self.model.named_parameters():
            if "classifier" not in name:
                param.requires_grad = False
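# Instantiation sketch for the module above. The class name and every
# hyperparameter value here are hypothetical; only the field names (`labels`,
# `model_name_or_path`, `output_dir`, `cache_dir`, `freeze_pretrained`) are
# taken from the __init__ itself.
hparams = {
    "labels": "data/labels.txt",               # assumed path consumed by LabelTokenAligner
    "model_name_or_path": "bert-base-cased",   # assumed checkpoint
    "output_dir": "outputs",
    "cache_dir": None,
    "freeze_pretrained": True,
}
module = TokenClassificationModule(hparams)     # class name is hypothetical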
def __init__(self, hparams, **kwargs):
    super().__init__()
    self.hparams = hparams
    self.kogpt3 = GPT2LMHeadModel.from_pretrained(
        "kykim/gpt3-kor-small_based_on_gpt2")
    self.loss_function = nn.CrossEntropyLoss(reduction='none')
    self.neg = -1e18  # large negative constant, presumably used to mask logits
    self.tokenizer = BertTokenizerFast.from_pretrained(
        "kykim/gpt3-kor-small_based_on_gpt2")
def __init__(self, config: Union[Dict[str, Dict[str, Any]], str]):
    """
    Create wrapper with configuration.

    :param config: config in dictionary format or path to config file (.yml)
    """
    if isinstance(config, str):
        self.config_file_path = config
        try:
            with open(config, "r") as f:
                config = yaml.safe_load(f)
        except Exception as ex:
            raise RuntimeError(f"Cannot read config file from {config}: {ex}")

    self.config = config
    self.util_config = config.get("util", None)

    model_config_dict = config.get("model", None)
    if not model_config_dict:
        raise ValueError("Config file should have a 'model' attribute")
    self.dataset_config = model_config_dict

    if model_config_dict["device"] is not None:
        self.device = torch.device(model_config_dict["device"]) \
            if torch.cuda.is_available() else torch.device("cpu")
    else:
        # Fall back to CPU when no device is configured (self.device is used below).
        self.device = torch.device("cpu")

    model_config_attributes = ["model", "intents", "entities"]
    # model_config_dict = {k: v for k, v in model_config_dict.items() if k in model_config_attributes}
    self.intents = model_config_dict["intents"]
    self.entities = ["O"] + model_config_dict["entities"]
    self.model_config = DIETClassifierConfig(**{
        k: v for k, v in model_config_dict.items()
        if k in model_config_attributes
    })

    training_config_dict = config.get("training", None)
    if not training_config_dict:
        raise ValueError("Config file should have a 'training' attribute")
    self.training_config = training_config_dict

    self.tokenizer = BertTokenizerFast.from_pretrained(model_config_dict["tokenizer"])
    self.model = DIETClassifier(config=self.model_config)
    self.model.to(self.device)
    self.softmax = torch.nn.Softmax(dim=-1)

    self.synonym_dict = {} if not model_config_dict.get("synonym") else model_config_dict["synonym"]
def inference(onnx_model, model_dir, examples, fast_tokenizer, num_threads):
    quantized_str = ''
    if 'quantized' in onnx_model:
        quantized_str = 'quantized'
    onnx_inference = []
    # pytorch_inference = []

    # ONNX session
    options = ort.SessionOptions()
    options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
    options.intra_op_num_threads = 1
    print(onnx_model)
    ort_session = ort.InferenceSession(onnx_model, options)

    # pytorch pretrained model and tokenizer
    tokenizer = BertTokenizerFast.from_pretrained(model_dir)
    tokenizer_str = "BertTokenizerFast"

    print("**************** {} ONNX inference with batch tokenization and with {} tokenizer ****************".format(
        quantized_str, tokenizer_str))
    start_onnx_inference_batch = time.time()

    start_batch_tokenization = time.time()
    tokens_dict = tokenizer.batch_encode_plus(examples, max_length=128)
    total_batch_tokenization_time = time.time() - start_batch_tokenization
    total_inference_time = 0
    total_build_label_time = 0

    for i in range(len(examples)):
        # ONNX inference with batch tokenization
        if i % 100 == 0:
            print('[inference... ]', i, 'out of ', len(examples))
        tokens = get_tokens(tokens_dict, i)

        # inference
        start_inference = time.time()
        ort_outs = ort_session.run(None, tokens)
        total_inference_time = total_inference_time + (time.time() - start_inference)

        # build label
        start_build_label = time.time()
        torch_onnx_output = torch.tensor(ort_outs[0], dtype=torch.float32)
        onnx_logits = F.softmax(torch_onnx_output, dim=1)
        logits_label = torch.argmax(onnx_logits, dim=1)
        label = logits_label.detach().cpu().numpy()
        # onnx_inference.append(label[0])
        onnx_inference.append(onnx_logits.detach().cpu().numpy()[0].tolist())
        total_build_label_time = total_build_label_time + (time.time() - start_build_label)
        # print(i, label[0], onnx_logits.detach().cpu().numpy()[0].tolist(), type(onnx_logits.detach().cpu().numpy()[0]))

    end_onnx_inference_batch = time.time()
    print("Total batch tokenization time (in seconds): ", total_batch_tokenization_time)
    print("Total inference time (in seconds): ", total_inference_time)
    print("Total build label time (in seconds): ", total_build_label_time)
    print("Duration ONNX inference (in seconds) with {} and batch tokenization: ".format(tokenizer_str),
          end_onnx_inference_batch - start_onnx_inference_batch,
          (end_onnx_inference_batch - start_onnx_inference_batch) / len(examples))
    return onnx_inference
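# `get_tokens` is called above but not shown. A plausible implementation (an
# assumption, not the source's code): slice one example out of the batch
# encoding and wrap each field as an int64 numpy array, since ONNX Runtime
# expects named numpy inputs.
import numpy as np

def get_tokens(tokens_dict, i):
    return {
        name: np.asarray([values[i]], dtype=np.int64)  # shape (1, seq_len)
        for name, values in tokens_dict.items()
    }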
def __init__(self, doc_maxlen):
    self.tok = BertTokenizerFast.from_pretrained('bert-base-uncased')
    self.doc_maxlen = doc_maxlen

    self.D_marker_token, self.D_marker_token_id = '[D]', self.tok.convert_tokens_to_ids('[unused1]')
    self.cls_token, self.cls_token_id = self.tok.cls_token, self.tok.cls_token_id
    self.sep_token, self.sep_token_id = self.tok.sep_token, self.tok.sep_token_id

    assert self.D_marker_token_id == 2
def __init__(self):
    super(LanguageInput, self).__init__()
    self.berttokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
    self.bertmodel = BertModel.from_pretrained('bert-base-uncased')

    # Do not finetune: freeze all BERT parameters.
    for p in self.bertmodel.parameters():
        p.requires_grad = False
def __init__(self): print("Tagger initializing...", flush=True) model_name = 'bert-base-uncased' config = BertConfig.from_pretrained(model_name) config.output_hidden_states = False self.tokenizer = BertTokenizerFast.from_pretrained( pretrained_model_name_or_path=model_name, config=config) self.model = BertForSequenceClassification.from_pretrained( "model_save") print("Tagger initialized...", flush=True)
def shard_and_map(index, filename, num_shards, function, **kwargs):
    print(f"Sharding on process {index}")
    # NOTE: `load_from_cache_file` is expected to be defined in the enclosing scope.
    shard = nlp.Dataset.from_file(filename).shard(
        num_shards, index, contiguous=True, load_from_cache_file=load_from_cache_file)
    print(f"Done sharding on process {index}. Mapping the shard")
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    return shard.map(partial(function, tokenizer=tokenizer), **kwargs)
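# Usage sketch (an assumption about the calling convention, not from the
# source): fan the shards out across worker processes. The filename and
# `tokenize_fn` are hypothetical.
from multiprocessing import Pool

num_shards = 8
with Pool(num_shards) as pool:
    shards = pool.starmap(
        shard_and_map,
        [(i, "train.arrow", num_shards, tokenize_fn) for i in range(num_shards)],
    )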
def __init__(self, args):
    self.args = args
    self.parse = Parser()
    self._token_indexers = {
        "tokens": SingleIdTokenIndexer(),
        "token_characters": TokenCharactersIndexer(),
    }
    if args.model_type == "elmo":
        self._token_indexers["elmo_characters"] = ELMoIndexer()
    self.bert_tokenizer = None
    if args.model_type == "bert":
        self.bert_tokenizer = BertTokenizerFast.from_pretrained(args.pretrained_model_dir)
def __init__(self, query_maxlen):
    self.tok = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')
    self.query_maxlen = query_maxlen

    self.Q_marker_token, self.Q_marker_token_id = '[Q]', self.tok.get_vocab()['[unused21]']
    self.cls_token, self.cls_token_id = self.tok.cls_token, self.tok.cls_token_id
    self.sep_token, self.sep_token_id = self.tok.sep_token, self.tok.sep_token_id
    self.mask_token, self.mask_token_id = self.tok.mask_token, self.tok.mask_token_id
def __init__(self): self.tok = BertTokenizerFast.from_pretrained( "bert-base-multilingual-cased") self.D_marker_token, self.D_marker_token_id = "[D]", self.tok.convert_tokens_to_ids( "[unused1]") self.cls_token, self.cls_token_id = self.tok.cls_token, self.tok.cls_token_id self.sep_token, self.sep_token_id = self.tok.sep_token, self.tok.sep_token_id assert self.D_marker_token_id == 1
def __init__(self, config: Bunch) -> None:
    super().__init__()
    self.config = config
    self.model = BertForSequenceClassification.from_pretrained(config.pretrained_model)
    bert_tokenizer = BertTokenizerFast.from_pretrained(self.config.pretrained_model)
    # PreTrainedTokenizer here is presumably a project-local wrapper class,
    # not the Hugging Face base class of the same name.
    tokenizer = PreTrainedTokenizer(bert_tokenizer, self.config.max_tokens_per_tweet)
    self.data_processor = DataProcessor(config, tokenizer)
    self.loss = CrossEntropyLoss()
def get_tokenizer(vocab_size):
    # `tokenizer_type` and `transformer_type` come from the enclosing module scope.
    pretrained_tokenizer_path = Path('experiments/tokenizers') / f'{tokenizer_type}-{vocab_size}'
    logger.info(
        f'loading {tokenizer_type}-{vocab_size} tokenizer from {pretrained_tokenizer_path}')
    if transformer_type == 'roberta':
        return RobertaTokenizerFast.from_pretrained(str(pretrained_tokenizer_path), max_len=512)
    return BertTokenizerFast.from_pretrained(str(pretrained_tokenizer_path), max_len=512)
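# Usage sketch: the module-level settings below are hypothetical values for
# the names get_tokenizer reads from its enclosing scope.
tokenizer_type = 'wordpiece'
transformer_type = 'bert'
tokenizer = get_tokenizer(vocab_size=30000)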
def __init__(self,
             model_name: str,
             tokenizer_name: Optional[str] = None,
             *,
             device: int = -1,
             ):
    self.model = AutoModelForTokenClassification.from_pretrained(model_name)
    self.tokenizer = BertTokenizerFast.from_pretrained(tokenizer_name or model_name)
    self.device = torch.device('cpu' if device < 0 else f'cuda:{device}')  # pylint: disable=no-member
    self.model.to(self.device)
def __init__(self, dataset: SquadDataset, bert_type: str, lazy=False, return_sample=False, eval=False):
    self.dataset = dataset
    self.tokenizer = BertTokenizerFast.from_pretrained(bert_type)
    self.return_sample = return_sample
    self.eval = eval
    if not lazy:
        # Eagerly preprocess every sample up front; __getitem__ presumably
        # tokenizes on the fly while self.lazy is True.
        self.lazy = True
        self.dataset = [self[index] for index in trange(0, len(self.dataset), desc="Preprocessing")]
    self.lazy = lazy
def __init__(self, data, labels):
    super().__init__()
    self.data = data
    # Get the tokenizer
    self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
    self.labels = labels
    # Process labels: if a label sequence is shorter than 512, pad it with 0s up to length 512.
    for label in labels:
        for i in range(len(label), 512):
            label.append(0)
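# A __getitem__ sketch consistent with the 512-padded labels above. This is an
# assumption: the source only shows __init__, and `data` is taken to be a list
# of strings. Tokenizing to the same fixed length keeps tokens and labels aligned.
import torch

def __getitem__(self, idx):
    enc = self.tokenizer(self.data[idx], padding='max_length', truncation=True, max_length=512)
    return {
        'input_ids': torch.tensor(enc['input_ids']),
        'attention_mask': torch.tensor(enc['attention_mask']),
        'labels': torch.tensor(self.labels[idx]),
    }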
def __init__(self, query_maxlen):
    self.tok = BertTokenizerFast.from_pretrained('bert-base-uncased')
    self.query_maxlen = query_maxlen

    self.Q_marker_token, self.Q_marker_token_id = '[Q]', self.tok.get_vocab()['[unused0]']
    self.cls_token, self.cls_token_id = self.tok.cls_token, self.tok.cls_token_id
    self.sep_token, self.sep_token_id = self.tok.sep_token, self.tok.sep_token_id
    self.mask_token, self.mask_token_id = self.tok.mask_token, self.tok.mask_token_id

    assert self.Q_marker_token_id == 1 and self.mask_token_id == 103
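# Sketch of how such query markers are typically used (ColBERT-style query
# augmentation; an assumption, since the snippets above only show __init__):
# prepend [CLS][Q], truncate to query_maxlen, and pad the remainder with
# [MASK] ids. The same pattern would apply to the multilingual variants below.
def encode_query(self, text):
    ids = self.tok(text, add_special_tokens=False)['input_ids']
    ids = [self.cls_token_id, self.Q_marker_token_id] + ids[: self.query_maxlen - 3] + [self.sep_token_id]
    ids = ids + [self.mask_token_id] * (self.query_maxlen - len(ids))  # mask-token padding
    return ids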
def __init__(self, vocabs: Dict[str, Vocabulary], subword: str, use_pos_tag: bool,
             bert_path: str, transliterate: str, d_model: int, partition: bool,
             pos_tag_emb_dropout: float, position_emb_dropout: float,
             bert_emb_dropout: float, emb_dropout: float, language: str,
             device: torch.device):
    super(EmbeddingLayer, self).__init__()

    self.BERT = BertModel.from_pretrained(bert_path)
    self.tokenizer = BertTokenizerFast.from_pretrained(bert_path)
    self.bert_hidden_size = self.BERT.config.hidden_size

    if subword == 'max_pool':
        self.pool = nn.AdaptiveMaxPool1d(1)
    elif subword == 'avg_pool':
        self.pool = nn.AdaptiveAvgPool1d(1)
    else:
        self.pool = None

    self.position_emb_dropout = nn.Dropout(position_emb_dropout)
    self.bert_emb_dropout = nn.Dropout(bert_emb_dropout)
    self.emb_dropout = nn.Dropout(emb_dropout)
    self.layer_norm = nn.LayerNorm(d_model, elementwise_affine=True)

    self.partition = partition
    self.d_model = d_model
    self.d_content = d_model // 2 if partition else d_model
    self.d_position = d_model - d_model // 2 if partition else d_model

    self.bert_proj = nn.Linear(self.bert_hidden_size, self.d_content)

    self.pos_tag_embeddings = None
    self.pos_tag_emb_dropout = None
    self.use_pos_tag = use_pos_tag
    if use_pos_tag:
        self.pos_tag_emb_dropout = nn.Dropout(pos_tag_emb_dropout)
        self.pos_tag_embeddings = nn.Embedding(
            vocabs['pos_tags'].size + 1, self.d_content,
            padding_idx=vocabs['pos_tags'].size)

    self.position_embeddings = LearnedPositionalEmbedding(self.d_position, max_len=512)

    if transliterate == '':
        self.bert_transliterate = None
    else:
        assert transliterate in TRANSLITERATIONS
        self.bert_transliterate = TRANSLITERATIONS[transliterate]

    self.subword = subword
    self.language = language
    if self.subword == CHARACTER_BASED and self.language == 'chinese':
        assert not self.use_pos_tag

    self.pos_tags_vocab = vocabs['pos_tags']
    self.labels_vocab = vocabs['labels']
    self.device = device
def __init__(self): self.tok = BertTokenizerFast.from_pretrained( "bert-base-multilingual-cased") self.Q_marker_token, self.Q_marker_token_id = "[Q]", self.tok.convert_tokens_to_ids( "[unused0]") self.cls_token, self.cls_token_id = self.tok.cls_token, self.tok.cls_token_id self.sep_token, self.sep_token_id = self.tok.sep_token, self.tok.sep_token_id self.mask_token, self.mask_token_id = self.tok.mask_token, self.tok.mask_token_id self.query_maxlen = self.tok.model_max_length assert self.Q_marker_token_id == 100 and self.mask_token_id == 103
def __init__(self, query_maxlen):
    self.tok = BertTokenizerFast.from_pretrained('bert-base-uncased')
    self.query_maxlen = query_maxlen

    self.Q_marker_token, self.Q_marker_token_id = '[Q]', self.tok.convert_tokens_to_ids('[unused0]')
    self.cls_token, self.cls_token_id = self.tok.cls_token, self.tok.cls_token_id
    self.sep_token, self.sep_token_id = self.tok.sep_token, self.tok.sep_token_id
    self.mask_token, self.mask_token_id = self.tok.mask_token, self.tok.mask_token_id
    self.nltk_stopwords = nltk.corpus.stopwords.words("english")

    assert self.Q_marker_token_id == 1 and self.mask_token_id == 103
async def main(message: types.Message):
    from transformers import pipeline, BertTokenizerFast, AutoModelWithLMHead

    tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
    model = AutoModelWithLMHead.from_pretrained('ckiplab/gpt2-base-chinese')
    text_generation = pipeline("text-generation", model=model, tokenizer=tokenizer)
    text_input = "机器学习是"  # "Machine learning is"
    # The pipeline returns a list of dicts; extract the generated string before replying.
    generated_text = text_generation(text_input, max_length=50, do_sample=False)[0]["generated_text"]
    await message.reply(text=generated_text)
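# Design note (a sketch, not the source's code): loading the model inside the
# handler reloads it on every message. Hoisting initialization to module scope
# pays the load cost once; replying to `message.text` is an assumption about
# the bot's intended behavior.
from transformers import pipeline, BertTokenizerFast, AutoModelWithLMHead

_tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
_model = AutoModelWithLMHead.from_pretrained('ckiplab/gpt2-base-chinese')
_text_generation = pipeline("text-generation", model=_model, tokenizer=_tokenizer)

async def main(message: types.Message):
    generated = _text_generation(message.text, max_length=50, do_sample=False)[0]["generated_text"]
    await message.reply(text=generated)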
def get_tokenizer() -> BertTokenizerFast:
    """Get the trained BERT tokenizer."""
    try:
        return BertTokenizerFast.from_pretrained(
            os.path.join(this_dir, 'data/models/fine-tuned-paragraph-classifier'),
            do_lower_case=False
        )
    except OSError:
        raise OSError('Failed to load model. Did you download the models by '
                      '`python -m synthesis_classifier.model download`?')
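# Usage sketch (the input sentence and flags are illustrative, not from the source):
tokenizer = get_tokenizer()
enc = tokenizer("The precursor was calcined at 700 °C.", truncation=True, return_tensors='pt')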
def __init__(self, query_maxlen):
    self.tok = BertTokenizerFast.from_pretrained('bert-base-multilingual-uncased')
    self.query_maxlen = query_maxlen

    self.Q_marker_token, self.Q_marker_token_id = '[Q]', self.tok.convert_tokens_to_ids('[unused0]')
    self.cls_token, self.cls_token_id = self.tok.cls_token, self.tok.cls_token_id
    self.sep_token, self.sep_token_id = self.tok.sep_token, self.tok.sep_token_id
    self.mask_token, self.mask_token_id = self.tok.mask_token, self.tok.mask_token_id

    assert self.Q_marker_token_id == 100 and self.mask_token_id == 103