def text_generation(PADDING_TEXT, prompt):
    from transformers import TFAutoModelWithLMHead, AutoTokenizer

    model = TFAutoModelWithLMHead.from_pretrained("xlnet-base-cased")
    tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

    # Padding text helps XLNet with short prompts - proposed by Aman Rusia in
    # https://github.com/rusiaaman/XLNet-gen#methodology
    # PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
    # (except for Alexei and Maria) are discovered.
    # The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
    # remainder of the story. 1883 Western Siberia,
    # a young Grigori Rasputin is asked by his father and a group of men to perform magic.
    # Rasputin has a vision and denounces one of the men as a horse thief. Although his
    # father initially slaps him for making such an accusation, Rasputin watches as the
    # man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
    # the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
    # with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""
    # prompt = "Today the weather is really nice and I am planning on "

    inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="tf")
    # Character length of the decoded input, used below to strip the padding text
    # and prompt from the decoded output.
    prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
    outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
    generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]
    print(generated)
    return generated
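
# Hypothetical usage of text_generation above (not from the original source); the
# padding text follows the XLNet-gen trick referenced in the comment, ending with
# the special tokens the model expects.
padding = ("Twenty years later, Rasputin sees a vision of the Virgin Mary, "
           "prompting him to become a priest. <eod> </s> <eos>")
text_generation(padding, "Today the weather is really nice and I am planning on ")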
def test_lmhead_model_from_pretrained(self):
    for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
        config = AutoConfig.from_pretrained(model_name)
        self.assertIsNotNone(config)
        self.assertIsInstance(config, BertConfig)

        model = TFAutoModelWithLMHead.from_pretrained(model_name)
        self.assertIsNotNone(model)
        self.assertIsInstance(model, TFBertForMaskedLM)
def test_lmhead_model_from_pretrained(self):
    logging.basicConfig(level=logging.INFO)
    # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
    for model_name in ["bert-base-uncased"]:
        config = AutoConfig.from_pretrained(model_name)
        self.assertIsNotNone(config)
        self.assertIsInstance(config, BertConfig)

        model = TFAutoModelWithLMHead.from_pretrained(model_name)
        self.assertIsNotNone(model)
        self.assertIsInstance(model, TFBertForMaskedLM)
def test_from_pretrained_identifier(self):
    model = TFAutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER, from_pt=True)
    self.assertIsInstance(model, TFBertForMaskedLM)
    self.assertEqual(model.num_parameters(), 14830)
    self.assertEqual(model.num_parameters(only_trainable=True), 14830)

    model = AutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER, from_tf=True)
    self.assertIsInstance(model, BertForMaskedLM)
    self.assertEqual(model.num_parameters(), 14410)
    self.assertEqual(model.num_parameters(only_trainable=True), 14410)
def test_from_identifier_from_model_type(self):
    model = TFAutoModelWithLMHead.from_pretrained(DUMMY_UNKWOWN_IDENTIFIER, from_pt=True)
    self.assertIsInstance(model, TFRobertaForMaskedLM)
    self.assertEqual(model.num_parameters(), 14830)
    self.assertEqual(model.num_parameters(only_trainable=True), 14830)

    model = AutoModelWithLMHead.from_pretrained(DUMMY_UNKWOWN_IDENTIFIER, from_tf=True)
    self.assertIsInstance(model, RobertaForMaskedLM)
    self.assertEqual(model.num_parameters(), 14410)
    self.assertEqual(model.num_parameters(only_trainable=True), 14410)
def masked_lang_fill(sequence, no_of_version):
    from transformers import TFAutoModelWithLMHead, AutoTokenizer
    import tensorflow as tf

    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
    model = TFAutoModelWithLMHead.from_pretrained("distilbert-base-cased")

    input_ids = tokenizer.encode(sequence, return_tensors="tf")
    # Position of the first mask token in the encoded sequence.
    mask_token_index = tf.where(input_ids == tokenizer.mask_token_id)[0, 1]

    token_logits = model(input_ids)[0]
    mask_token_logits = token_logits[0, mask_token_index, :]

    # Take the `no_of_version` most likely fillers for the masked position.
    top_tokens = tf.math.top_k(mask_token_logits, no_of_version).indices.numpy()
    for token in top_tokens:
        print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
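
# Hypothetical call (not from the original source): "[MASK]" is the mask token of
# distilbert-base-cased, so the model ranks the 5 most likely fillers for that slot.
masked_lang_fill("The capital of France is [MASK].", 5)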
def Summarization(ARTICLE):
    from transformers import TFAutoModelWithLMHead, AutoTokenizer

    model = TFAutoModelWithLMHead.from_pretrained("t5-base")
    tokenizer = AutoTokenizer.from_pretrained("t5-base")

    # T5 uses a max_length of 512, so we truncate the article to 512 tokens.
    inputs = tokenizer.encode("summarize: " + ARTICLE, return_tensors="tf",
                              max_length=512, truncation=True)
    summary_ids = model.generate(inputs, max_length=150, min_length=40,
                                 length_penalty=2.0, num_beams=4, early_stopping=True)
    output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return output
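
# Hypothetical usage of Summarization (the article text here is a placeholder,
# not from the original source); any long string works since the encoder input
# is truncated to 512 tokens.
sample_article = "The tower is 324 metres tall, about the same height as an 81-storey building ..."
print(Summarization(sample_article))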
def run_generation(args):
    if args.lib == 'pt':
        model = AutoModelWithLMHead.from_pretrained(args.model)
    elif args.lib == 'tf':
        model = TFAutoModelWithLMHead.from_pretrained(args.model)
    else:
        raise ValueError("'{}' is not a supported lib (use 'pt' or 'tf')".format(args.lib))

    tokenizer = AutoTokenizer.from_pretrained(args.model)

    if args.input:
        input_text = args.input
    else:
        input_text = TEXT
    print("Text: {}".format(input_text))

    # `return_tensors` accepts the same 'pt'/'tf' switch as the model choice.
    tokenized_input_words = tokenizer.encode(input_text, add_special_tokens=False, return_tensors=args.lib)
    print("BOS: {}".format(tokenizer.bos_token_id))
    print("PAD: {}".format(tokenizer.pad_token_id))
    print("EOS: {}".format(tokenizer.eos_token_id))
    print("Input tokens: {}".format(tokenized_input_words))

    torch.manual_seed(0)
    generated_tokens = model.generate(tokenized_input_words,
                                      bos_token_id=tokenizer.bos_token_id,
                                      eos_token_id=tokenizer.eos_token_id,
                                      pad_token_id=tokenizer.pad_token_id,
                                      do_sample=False,
                                      no_repeat_ngram_size=2,
                                      max_length=40,
                                      num_beams=5,
                                      early_stopping=True)
    print("Output tokens: {}".format(generated_tokens))
    generated_words = tokenizer.decode(generated_tokens[0], skip_special_tokens=True,
                                       clean_up_tokenization_spaces=True)
    print("Output text: {}".format(generated_words))
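
# A minimal CLI harness for run_generation, assuming the attribute names used
# above (lib, model, input); this wiring is an illustration, not the original script.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--lib", choices=["pt", "tf"], default="pt")
    parser.add_argument("--model", default="gpt2")
    parser.add_argument("--input", default=None,
                        help="falls back to the module-level TEXT constant when omitted")
    run_generation(parser.parse_args())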
def test_from_pretrained_identifier(self):
    logging.basicConfig(level=logging.INFO)
    model = TFAutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
    self.assertIsInstance(model, TFBertForMaskedLM)
    self.assertEqual(model.num_parameters(), 14830)
    self.assertEqual(model.num_parameters(only_trainable=True), 14830)
def test_from_pretrained_identifier(self):
    logging.basicConfig(level=logging.INFO)
    model = TFAutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
    self.assertIsInstance(model, TFBertForMaskedLM)
check_output_path(args.output_path, force=True)
tokenizer = AutoTokenizer.from_pretrained(args.model_select)
dataset = load_dataset(*args.dataset_name.split(", "))
# num_proc=6 can ideally give a 6x speedup over a single process when tokenizing
# many examples - the main reason for using HF datasets instead of torch.Dataset here.
encoded = dataset.map(convert_to_features, batched=True,
                      fn_kwargs={"args": args, "tokenizer": tokenizer}, num_proc=6)
columns = ['input_ids', "source_lengths", "target_lengths",
           'attention_mask', 'labels', 'decoder_attention_mask']
encoded.set_format(type='tensorflow', columns=columns)

if args.do_train:
    add_filehandler_for_logger(args.output_path, logger, out_name="train")
    strategy = get_strategy(args, logger)
    with strategy.scope():
        # from_pt=True to avoid repeated downloading
        model = TFAutoModelWithLMHead.from_pretrained(args.model_select, from_pt=True)
        train_dataset = get_dataset(encoded["train"], tag="train")
        val_dataset = None
        if "validation" in encoded:
            val_dataset = get_dataset(encoded["validation"], tag="eval")
        trainer = T2TTrainer(args, logger)
        trainer.train(model, strategy, tokenizer, train_dataset=train_dataset,
                      eval_dataset=val_dataset, evaluate_fn=evaluate, verbose=True)

# We want testing to be as independent of training as possible, so it is okay
# to run the test when args.do_train is False and checkpoints already exist.
if args.do_test:
    test_set = "test"
    if test_set in encoded:
        add_filehandler_for_logger(args.output_path, logger, out_name="test")
        sorted_indices, index2path = get_existing_cks(args.output_path, return_best_ck=False)
        if args.ck_index_select < 0:
def create_mlm_model_and_optimizer():
    with strategy.scope():
        model = TFAutoModelWithLMHead.from_pretrained(PRETRAINED_MODEL)
        optimizer = tf.keras.optimizers.Adam(learning_rate=LR)
    return model, optimizer
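
# The helper above closes over module-level globals; a minimal sketch of what
# they might look like (the names match the snippet, but these values are assumptions):
import tensorflow as tf

strategy = tf.distribute.get_strategy()  # default no-op strategy outside TPU/multi-GPU setups
PRETRAINED_MODEL = "distilbert-base-cased"
LR = 5e-5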
sentiment_analysis_model = pipeline('sentiment-analysis')
text_generation_model = pipeline('text-generation')


def sentiment_analysis(text: str):
    result = sentiment_analysis_model(text)
    return result


def text_generation(text: str):
    result = text_generation_model(text, max_length=80, do_sample=False)
    return result


t5_model = TFAutoModelWithLMHead.from_pretrained("t5-large", return_dict=True)
t5_tokenizer = AutoTokenizer.from_pretrained("t5-large")


def summarize(text: str):
    # T5 uses a max_length of 512, so we truncate the article to 512 tokens.
    inputs = t5_tokenizer.encode("summarize: " + text, return_tensors="tf",
                                 max_length=512, truncation=True)
    outputs = t5_model.generate(inputs, max_length=150, min_length=40,
                                length_penalty=2.0, num_beams=4, early_stopping=True)
    result = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return result
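
# Hypothetical calls to the three helpers above (inputs are placeholders, not
# from the original source):
print(sentiment_analysis("I love this library!"))
print(text_generation("Once upon a time"))
print(summarize("Long article text goes here ..."))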
# Remodel the model and save it as TensorFlow (tf_model.h5).

# Create folders to save the converted models.
import os
os.makedirs("./model/pb_model", exist_ok=True)
os.makedirs("./model/tf_model/keras", exist_ok=True)

# Use the Hugging Face converter as described here:
# https://huggingface.co/transformers/model_sharing.html
from transformers import TFAutoModelWithLMHead, AutoTokenizer
import tensorflow as tf

# Load pytorch_model.bin and the related model structure, convert to h5 ...
tf_model = TFAutoModelWithLMHead.from_pretrained("./model/trained_model/", from_pt=True)
# ... save the converted tf_model.h5 in "tf_model" ...
tf_model.save_pretrained("./model/tf_model/")
# ... save "saved_model.pb" in "pb_model" ...
tf_model.save("./model/pb_model/")
# ... and save a SavedModel copy under "keras".
tf.saved_model.save(tf_model, "./model/tf_model/keras")

# Loading the h5 model is not a problem with TFAutoModelWithLMHead.
loaded = tf.saved_model.load("./model/tf_model/keras")

tokenizer = AutoTokenizer.from_pretrained("anonymous-german-nlp/german-gpt2")
prompt = "Ada liebte ihre Katze"
inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="tf")
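
# Sanity check (an illustrative addition, not from the original source): weights
# saved with save_pretrained() reload directly through the auto class, and the
# reloaded model should continue the prompt encoded above.
reloaded = TFAutoModelWithLMHead.from_pretrained("./model/tf_model/")
outputs = reloaded.generate(inputs, max_length=40, do_sample=True, top_k=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))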
def test_from_identifier_from_model_type(self):
    logging.basicConfig(level=logging.INFO)
    model = TFAutoModelWithLMHead.from_pretrained(DUMMY_UNKWOWN_IDENTIFIER)
    self.assertIsInstance(model, TFRobertaForMaskedLM)
    self.assertEqual(model.num_parameters(), 14830)
    self.assertEqual(model.num_parameters(only_trainable=True), 14830)
import configparser
import logging
import re

from transformers import AutoTokenizer, TFAutoModelWithLMHead

from aliyunsdkcore.client import AcsClient
from aliyunsdkcore.acs_exception.exceptions import ClientException
from aliyunsdkcore.acs_exception.exceptions import ServerException
from aliyunsdkalimt.request.v20181012.TranslateGeneralRequest import TranslateGeneralRequest

logger = logging.getLogger()
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s %(levelname)-8s %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.INFO)

logger.info("Loading model ...")
tokenizer = AutoTokenizer.from_pretrained(
    "/home/admin/workspace/model/transformers/bert-base-multilingual-cased")
model = TFAutoModelWithLMHead.from_pretrained(
    "/home/admin/workspace/model/transformers/bert-base-multilingual-cased")

config = configparser.ConfigParser()
config.read("/home/admin/workspace/.secret")
client = AcsClient(config["account xjx"]["access_key"],
                   config["account xjx"]["access_secret"],
                   'cn-hangzhou')


def cut_sentences(text, min_len=3):
    """Cut sentences by their length and punctuation, remove all spaces."""
    text = text.replace(" ", "")
    corpus = re.split(r"[\,\.\?,。?\n]", text)
    corpus = list(filter(lambda x: len(x) >= min_len, corpus))
    return corpus
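
# Hypothetical call (not from the original source): spaces are stripped, the text
# is split on Chinese and Latin punctuation, and fragments shorter than min_len
# characters are dropped.
print(cut_sentences("你好,世界。This is a test, with several clauses.", min_len=3))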
if selector == '1':
    reader = FARMReader(model_name_or_path=os.path.join(dir_path, 'SaveAlbert'),
                        use_gpu=False, num_processes=1)
elif selector == '2':
    reader = FARMReader(model_name_or_path=os.path.join(dir_path, 'SaveBERT'),
                        use_gpu=False, num_processes=1)
elif selector == '3':
    reader = FARMReader(model_name_or_path="./SaveXLNet",
                        use_gpu=False, num_processes=1)

nlp = en_coref_md.load()
model_sum = TFAutoModelWithLMHead.from_pretrained(os.path.join(dir_path, 'summarizer'),
                                                  return_dict=True)
tokenizer_sum = AutoTokenizer.from_pretrained(os.path.join(dir_path, 'summarizer'))
classifier = pipeline("zero-shot-classification",
                      model=os.path.join(dir_path, 'zero-shot-classifier'))

document_store = ElasticsearchDocumentStore(host="localhost", username="",
                                            password="", index="rethinkl_test1")
retriever = DensePassageRetriever(document_store=document_store)

tokenizer_converse = AutoTokenizer.from_pretrained(os.path.join(dir_path, 'DialoGPT-large'))
model_converse = AutoModelForCausalLM.from_pretrained(os.path.join(dir_path, 'DialoGPT-large'))

finder = Finder(reader, retriever)