def test_TFBertForMaskedLM(self):
    from transformers import BertConfig, TFBertForMaskedLM
    keras.backend.clear_session()
    # pretrained_weights = 'bert-base-uncased'
    tokenizer_file = 'bert_bert-base-uncased.pickle'
    tokenizer = self._get_tokenzier(tokenizer_file)
    text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
    config = BertConfig()
    model = TFBertForMaskedLM(config)
    predictions = model.predict(inputs)
    onnx_model = keras2onnx.convert_keras(model, model.name)
    self.assertTrue(
        run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions,
                         self.model_files, rtol=1.e-2, atol=1.e-4))
def __init__(self, index_type="mlm", model_path="bert-base-uncased", **kwargs):
    Expander.__init__(self, index_type)
    self.candidate_pos = ["NOUN", "ADJ", "ADV"]
    self.model_path = model_path
    allowed_keys = list(self.__dict__.keys())
    self.__dict__.update((k, v) for k, v in kwargs.items() if k in allowed_keys)
    rejected_keys = set(kwargs.keys()) - set(allowed_keys)
    if rejected_keys:
        raise ValueError(
            "Invalid arguments in {} constructor: {}".format(
                self.__class__.__name__, rejected_keys))
    logger.info(">> loading HF model for Query Expansion from " + model_path)
    self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, use_fast=True)
    self.model = TFBertForMaskedLM.from_pretrained(self.model_path, from_pt=True)
    logger.info(">> Loading Spacy NLP model")
    try:
        self.nlp = spacy.load('en_core_web_md')
    except OSError:
        logger.info("Downloading language model for the spaCy POS tagger "
                    "(don't worry, this will only happen once)")
        from spacy.cli import download
        download('en_core_web_md')
        self.nlp = spacy.load('en_core_web_md')
def _init_model(self):
    """Initializes model."""
    if self.model_dir:
        logger.status_update("Loading BERT model at {}...".format(self.model_dir))
        self.model = TFBertForMaskedLM.from_pretrained(
            self.model_dir, from_pt=True, config=self.config)
    elif self.model_name:
        logger.status_update("Loading BERT model {}...".format(self.model_name))
        self.model = TFBertForMaskedLM.from_pretrained(
            self.model_name, config=self.config)
    return self.model
def main(raw_args=None):
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, required=True,
                        help="model name e.g. xlnet-tiny-chinese")
    parser.add_argument("--cache_dir", type=str, default=None, required=False,
                        help="Directory containing pytorch model")
    parser.add_argument("--pytorch_model_path", type=str, required=True,
                        help="/path/to/<pytorch-model-name>.bin")
    parser.add_argument("--tf_cache_dir", type=str, required=True,
                        help="Directory in which to save tensorflow model")
    args = parser.parse_args(raw_args)

    # Load the PyTorch model in TensorFlow (cache_dir must contain the
    # PyTorch checkpoint together with its config.json)
    tf_model = TFBertForMaskedLM.from_pretrained(args.cache_dir, from_pt=True)

    # Save the TensorFlow model
    tf_model.save_pretrained(args.tf_cache_dir)
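# Sketch of driving main() above programmatically through raw_args; all
# paths below are hypothetical placeholders, not taken from the original.
if __name__ == "__main__":
    main(raw_args=[
        "--model_name", "bert-base-uncased",
        "--cache_dir", "./pytorch_model_dir",
        "--pytorch_model_path", "./pytorch_model_dir/pytorch_model.bin",
        "--tf_cache_dir", "./tf_model_dir",
    ])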
def __init__(self):
    self.use_cuda = torch.cuda.is_available()
    self.device = torch.device("cuda" if self.use_cuda else "cpu")
    self.bertmodel = 'bert-large-uncased'
    self.tokenizer = BertTokenizer.from_pretrained(self.bertmodel)
    # TFBertForMaskedLM is a Keras model, so the PyTorch-style .to(self.device)
    # and .eval() calls do not apply; TensorFlow handles device placement itself.
    self.model = TFBertForMaskedLM.from_pretrained(self.bertmodel)
def _build_masked_lm_from_huggingface(self):
    from transformers import TFBertForMaskedLM

    model = TFBertForMaskedLM.from_pretrained(
        os.path.join(os.environ["PYTORCH_MODEL_PATH"],
                     "bert_uncased_L-6_H-768_A-12-pytorch"),
        from_pt=True,
    )
    return model
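# The loader above assumes the PYTORCH_MODEL_PATH environment variable is
# set before it runs; a hypothetical setup (the directory is illustrative):
import os
os.environ["PYTORCH_MODEL_PATH"] = "/path/to/checkpoints"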
def test_TFBertForMaskedLM(self):
    from transformers import BertTokenizer, TFBertForMaskedLM
    pretrained_weights = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
    text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
    model = TFBertForMaskedLM.from_pretrained(pretrained_weights)
    predictions = model.predict(inputs)
    onnx_model = keras2onnx.convert_keras(model, model.name)
    self.assertTrue(
        run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions,
                         self.model_files, rtol=1.e-2, atol=1.e-4))
def language_decoder(args):
    lang_model = TFBertForMaskedLM.from_pretrained(
        args.model_name, cache_dir='/scratch/gpfs/zzada/cache-tf')
    d_size = lang_model.config.hidden_size
    v_size = lang_model.config.vocab_size

    # Reuse only the pretrained MLM head and freeze its weights.
    lang_decoder = lang_model.mlm
    lang_decoder.trainable = False

    inputs = Input((d_size, ))
    x = Reshape((1, d_size))(inputs)
    x = lang_decoder(x)
    x = Reshape((v_size, ))(x)
    # x = Lambda(lambda z: tf.gather(z, vocab_indices, axis=-1))(x)
    x = Activation('softmax')(x)

    lm_decoder = Model(inputs=inputs, outputs=x)
    lm_decoder.summary()
    return lm_decoder
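# Hedged driver for language_decoder() above; it only reads args.model_name,
# so a SimpleNamespace stands in for parsed CLI arguments.
import types

args = types.SimpleNamespace(model_name='bert-base-uncased')
lm_decoder = language_decoder(args)  # maps a hidden-state vector to vocab probabilities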
def main(args):
    root = os.environ.get('BASE_DIR')
    tmp = root + "/models/tmp/" + args.model
    savepath = root + "/models/" + args.model
    for folder in [tmp, savepath]:
        if not os.path.exists(folder):
            os.makedirs(folder)

    # Download the transformers model and save it in the tmp folder
    model = BertForMaskedLM.from_pretrained(args.model)
    model.save_pretrained(tmp)

    # Load the PyTorch model in TensorFlow
    tf_model = TFBertForMaskedLM.from_pretrained(tmp, from_pt=True)

    # Save the TensorFlow model
    tf.saved_model.save(tf_model, savepath)

    # Download needed files
    url = "https://s3.amazonaws.com/models.huggingface.co/bert/" + args.model + "/"
    wget.download(url + 'config.json', savepath)
    wget.download(url + 'vocab.txt', savepath)

    # Rename files
    os.rename(savepath + "/config.json", savepath + "/bert_config.json")
    os.rename(savepath + "/variables/variables.data-00000-of-00001",
              savepath + "/bert_model.ckpt.data-00000-of-00001")
    os.rename(savepath + "/variables/variables.index",
              savepath + "/bert_model.ckpt.index")

    # Remove files that are no longer needed
    os.rmdir(savepath + "/assets")
    os.rmdir(savepath + "/variables")
    os.remove(savepath + "/saved_model.pb")
    shutil.rmtree(root + "/models/tmp")
def load_model(model_path):
    return (
        TFBertForMaskedLM.from_pretrained(model_path),
        BertTokenizer.from_pretrained(model_path),
    )
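# Minimal usage sketch for load_model(); the checkpoint name is an assumed
# example, and the call / .logits access follow the standard transformers TF API.
model, tokenizer = load_model('bert-base-uncased')
enc = tokenizer("Paris is the capital of [MASK].", return_tensors="tf")
logits = model(**enc).logits  # shape: (batch, seq_len, vocab_size)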
def get_custom_MLMmodel(num_tokens):
    # Resize the embeddings in PyTorch, round-trip through disk, and reload in TF.
    model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    model.resize_token_embeddings(num_tokens)
    model.save_pretrained('tmp/CustomModel/')
    return TFBertForMaskedLM.from_pretrained('tmp/CustomModel', from_pt=True)
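# Hypothetical usage of get_custom_MLMmodel(): after adding new tokens to a
# matching tokenizer, rebuild the TF model with a resized embedding matrix.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer.add_tokens(['[NEW1]', '[NEW2]'])  # example added tokens
tf_mlm = get_custom_MLMmodel(num_tokens=len(tokenizer))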
def download_bert_model():
    return TFBertForMaskedLM.from_pretrained("bert-base-cased")
if __name__ == '__main__':
    model_path = "../tcdata/bert/"
    tokenizer = BertTokenizer.from_pretrained("../tcdata/bert/vocab.txt")
    model_config = BertConfig.from_pretrained("../tcdata/bert/config.json")
    # model_config.output_attentions = False
    # model_config.output_hidden_states = False
    # model_config.use_cache = True
    #
    # bert_model = TFBertModel.from_pretrained(pretrained_model_name_or_path=model_path, from_pt=False,
    #                                          config=model_config, cache_dir="../user_data/temp")
    # model = TFBertForMaskedLM(config=model_config)
    # model.bert = bert_model
    # model.resize_token_embeddings(len(tokenizer))
    model = TFBertForMaskedLM.from_pretrained(pretrained_model_name_or_path=model_path, from_pt=False,
                                              config=model_config, cache_dir="../user_data/temp")
    model.resize_token_embeddings(len(tokenizer))

    # inputs = tokenizer("中国的首都是[MASK]", return_tensors="tf")
    # inputs["labels"] = tokenizer("中国的首都是北京", return_tensors="tf")["input_ids"]
    # The input means "The capital of China is [MASK]" (expected fill: 北京, "Beijing").
    inputs = tokenizer.encode("中国的首都是[MASK]", return_tensors="tf")
    # print(tokenizer.tokenize("中国的首都是[MASK]"))
    outputs = model(inputs)
    # print(outputs)
    # exit(0)
    o1 = tf.argmax(outputs.logits[0], axis=1)
    print(o1)
    print(tokenizer.decode(o1))
import tensorflow as tf
import numpy as np
import jsonpickle
from tensorflow.python.keras.layers import Softmax
from transformers import BertTokenizer, TFBertForMaskedLM

from src.spacy_utils import LANG_MODEL, PatternNotFoundException
from src.semantic_sequence import SemanticSequence, MASK
from src.qualia_structure import CreationStrategy, QualiaElement, DebugQualiaStructure, Role
from spacy.lang.en.stop_words import STOP_WORDS

NAME_OF_MODEL = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(NAME_OF_MODEL)
MODEL = TFBertForMaskedLM.from_pretrained(NAME_OF_MODEL, return_dict=True)


class NumpyFloatHandler(jsonpickle.handlers.BaseHandler):
    '''
    Handler to convert numpy floats to strings. Otherwise they would be
    serialized as None.
    '''

    def flatten(self, obj, data):
        return str(obj)


# np.float was an alias for the builtin float and was removed in NumPy 1.24,
# so the handler is registered for float itself.
jsonpickle.handlers.registry.register(float, NumpyFloatHandler)
jsonpickle.handlers.registry.register(np.float32, NumpyFloatHandler)
jsonpickle.handlers.registry.register(np.float64, NumpyFloatHandler)
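# Quick illustration of the handlers registered above: numpy floats are
# flattened to strings on encoding (the value here is arbitrary).
print(jsonpickle.encode(np.float32(0.75)))  # -> '"0.75"'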
"""

import tensorflow as tf
import numpy as np

# Allow GPU memory to grow instead of pre-allocating it all
for gpu in tf.config.experimental.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)

from transformers import BertTokenizer, TFBertForMaskedLM

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertForMaskedLM.from_pretrained('bert-base-cased', return_dict=True)

inputs = tokenizer("The capital of France is [MASK].", return_tensors="tf")
outputs = model(inputs)
logits = outputs.logits
# Position 6 is the [MASK] token: [CLS] The capital of France is [MASK] . [SEP]
output = np.argmax(logits[0][6])
o1 = tokenizer.decode(int(output))

inputs = tokenizer("The capital of [MASK] is BeiJing.", return_tensors="tf")
outputs = model(inputs)
logits = outputs.logits
# Position 4 is the [MASK] token: [CLS] The capital of [MASK] is Bei ##Jing . [SEP]
output = np.argmax(logits[0][4])
o2 = tokenizer.decode(int(output))
def sentence_generator():
    model_path = "./data/bert-large-cased-whole-word-masking"
    model = TFBertForMaskedLM.from_pretrained(model_path)
    tokenizer = BertTokenizer.from_pretrained(model_path)
    return RhymeGenerator(model, tokenizer)
import json

import tensorflow as tf
from transformers import TFBertForMaskedLM, BertTokenizer
from utils import tokenize_and_label, train_test_split
from sklearn.metrics import classification_report

lm_model = TFBertForMaskedLM.from_pretrained('../SavedModels/DiBERT')  # fine-tuned BERT
# lm_model = TFBertForMaskedLM.from_pretrained('bert-base-uncased')  # pretrained BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer.add_tokens(["[STARTQ]", "[ENDQ]", "[URL]"])
lm_model.resize_token_embeddings(len(tokenizer))


class TaskModel(tf.keras.models.Model):
    def __init__(self, trained_lm_model, num_classes=5):
        super(TaskModel, self).__init__()
        self.encoder = trained_lm_model.layers[0]
        self.prediction = tf.keras.layers.Dense(num_classes, activation='softmax')

    def call(self, inputs):
        encoded_seq, _ = self.encoder(inputs)
        return self.prediction(encoded_seq)


task_model = TaskModel(lm_model)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.00001)

with open('../Data/annotated_threads.json', 'r') as f:
    data = json.load(f)
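# Hedged sketch of one gradient step for TaskModel; the batch shapes and
# per-token integer labels are assumptions, since the original snippet
# stops after loading the JSON data.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()

@tf.function
def train_step(input_ids, labels):
    with tf.GradientTape() as tape:
        probs = task_model(input_ids)  # (batch, seq_len, num_classes)
        loss = loss_fn(labels, probs)  # labels: (batch, seq_len) int class ids
    grads = tape.gradient(loss, task_model.trainable_variables)
    optimizer.apply_gradients(zip(grads, task_model.trainable_variables))
    return loss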