def ingest():
    """
    Any encoder model from Hugging Face is applicable.
    TODO: put url here
    Corpus example: squad | MedQA or FindZebra
    """
    typer.secho("Welcome to the ingest command", fg=typer.colors.WHITE, bold=True)
    model = BertModel.from_pretrained(Config['model'].get())
    fast_tokenizer = PreTrainedTokenizerFast.from_pretrained(Config['tokenizer'].get())
    # fast_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    corpus = load_dataset(Config['corpus'].get(), split='train[:100]')
    # cache_dir=Config['cache_dir'].get() -- cache directory override
    torch.set_grad_enabled(False)

    typer.secho("Embedding corpus into dense context vector representations (to be indexed with FAISS).")
    corpus_embeddings = corpus.map(
        lambda example: {
            'embeddings': model(**fast_tokenizer(example['line'], return_tensors='pt'))['pooler_output'][0].numpy()
        })
    # corpus_embeddings.save_to_disk(os.path.join(Config['cache_dir'].get(), "corpus/"))

    typer.secho("Adding a FAISS index for efficient similarity search and clustering of dense vectors.")
    corpus_embeddings.add_faiss_index(column='embeddings')

    typer.secho("Saving the index")
    corpus_embeddings.save_faiss_index("embeddings", "corpus.faiss")  # os.path.join(Config['cache_dir'].get())
    return 0
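# A minimal sketch of how the index written by ingest() might be queried, assuming the
# same Config keys, the 'line' text column, and the 'corpus.faiss' file saved above;
# the query() helper itself is hypothetical and not part of the original code.
def query(question: str, k: int = 5):
    model = BertModel.from_pretrained(Config['model'].get())
    fast_tokenizer = PreTrainedTokenizerFast.from_pretrained(Config['tokenizer'].get())
    with torch.no_grad():
        # Embed the question with the same encoder used at ingest time.
        question_embedding = model(
            **fast_tokenizer(question, return_tensors='pt'))['pooler_output'][0].numpy()

    # Reload the corpus slice and attach the FAISS index saved by ingest().
    corpus = load_dataset(Config['corpus'].get(), split='train[:100]')
    corpus.load_faiss_index('embeddings', 'corpus.faiss')

    scores, retrieved = corpus.get_nearest_examples('embeddings', question_embedding, k=k)
    return list(zip(scores, retrieved['line']))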
def setUp(self):
    self.test_rust_tokenizer = False  # because we don't have pretrained_vocab_files_map
    super().setUp()
    self.test_rust_tokenizer = True

    self.tokenizers_list = [(PreTrainedTokenizerFast, "robot-test/dummy-tokenizer-fast", {})]

    tokenizer = PreTrainedTokenizerFast.from_pretrained("robot-test/dummy-tokenizer-fast")
    tokenizer.save_pretrained(self.tmpdirname)
def __init__(self, bot):
    self.bot = bot
    self.model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
    self.tokenizer = PreTrainedTokenizerFast.from_pretrained(
        'skt/kogpt2-base-v2',
        bos_token='</s>',
        eos_token='</s>',
        unk_token='<unk>',
        pad_token='<pad>',
        mask_token='<mask>')
def get_kobart_tokenizer():
    tokenizer = PreTrainedTokenizerFast.from_pretrained("hyunwoongko/kobart")
    tokenizer.pad_token = "<pad>"
    tokenizer.bos_token = "<s>"
    tokenizer.eos_token = "</s>"
    tokenizer.unk_token = "<unk>"
    tokenizer.mask_token = "<mask>"
    return tokenizer
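# A short usage sketch for get_kobart_tokenizer(), assuming the "hyunwoongko/kobart"
# checkpoint is reachable; the example sentence is arbitrary.
tokenizer = get_kobart_tokenizer()
text = tokenizer.bos_token + "안녕하세요" + tokenizer.eos_token
input_ids = tokenizer(text)['input_ids']  # token ids including <s> ... </s>
print(tokenizer.decode(input_ids, skip_special_tokens=True))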
def __init__(self, model: str, device: str):
    config = BartConfig.from_pretrained("hyunwoongko/kobart")
    self.model = BartForConditionalGeneration(config).half().eval().to(device)
    self.model.model.load_state_dict(torch.load(
        model,
        map_location=device,
    ))
    self.tokenizer = PreTrainedTokenizerFast.from_pretrained("hyunwoongko/kobart")
    self.device = device
def setUp(self):
    self.test_rust_tokenizer = False  # because we don't have pretrained_vocab_files_map
    super().setUp()
    self.test_rust_tokenizer = True

    # Inclusion of 2 tokenizers to test different types of models (Unigram and WordLevel for the moment)
    model_paths = ["robot-test/dummy-tokenizer-fast", "robot-test/dummy-tokenizer-wordlevel"]
    self.tokenizers_list = [(PreTrainedTokenizerFast, model_path, {}) for model_path in model_paths]

    tokenizer = PreTrainedTokenizerFast.from_pretrained(model_paths[0])
    tokenizer.save_pretrained(self.tmpdirname)
def main(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = PreTrainedTokenizerFast.from_pretrained("hyunwoongko/kobart")
    model = BartForConditionalGeneration.from_pretrained(args.finetuned_model_path)
    model.eval()
    model.to(device)

    # Korean example sentences to convert to chosung (initial consonants) before decoding.
    examples = [
        "배고프다",
        "너무너무 사랑해요",
        "나는 너를 좋아해",
        "저의 취미는 축구입니다",
        "어제 무슨 영화 봤어?",
        "짜장면 짬뽕 탕수육 먹었어",
    ]

    for example in examples:
        chosung_example = convert_text_to_chosung(example)
        input_ids = (torch.tensor(
            tokenizer.convert_tokens_to_ids(
                tokenizer.tokenize(chosung_example))).unsqueeze(0).to(device))

        if args.decoding_method == "top_p":
            outputs = model.generate(
                input_ids=input_ids,
                max_length=48,
                temperature=1.0,
                do_sample=True,
                top_p=0.8,
                pad_token_id=tokenizer.pad_token_id,
                bos_token_id=tokenizer.bos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                decoder_start_token_id=tokenizer.bos_token_id,
                num_return_sequences=5,
            )
        elif args.decoding_method == "beam_search":
            outputs = model.generate(
                input_ids=input_ids,
                max_length=48,
                num_beams=10,
                pad_token_id=tokenizer.pad_token_id,
                bos_token_id=tokenizer.bos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                decoder_start_token_id=tokenizer.bos_token_id,
                num_return_sequences=5,
            )
        else:
            raise ValueError("Enter the right decoding method (top_p or beam_search)")

        for output in outputs.tolist():
            answer = tokenizer.decode(output)
            # 초성 = chosung (input initial consonants), 예측 문장 = predicted sentence
            print(f"초성: {chosung_example} \t 예측 문장: {answer}")
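# main() relies on a convert_text_to_chosung helper that is not shown here. A minimal
# sketch, assuming it maps each composed Hangul syllable to its initial consonant via
# standard Unicode decomposition and leaves all other characters untouched:
CHOSUNG_LIST = [
    'ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ',
    'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ',
]

def convert_text_to_chosung(text: str) -> str:
    chars = []
    for ch in text:
        code = ord(ch)
        if 0xAC00 <= code <= 0xD7A3:  # composed Hangul syllable block
            # initial consonant index = (codepoint - 0xAC00) // (21 vowels * 28 finals)
            chars.append(CHOSUNG_LIST[(code - 0xAC00) // 588])
        else:
            chars.append(ch)  # keep spaces, punctuation, non-Hangul as-is
    return "".join(chars)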
def __init__(self, datapath, max_seq_len=128):
    self.datapath = datapath
    self.data = pd.read_csv(self.datapath, sep='\t')
    self.bos_token = '</s>'
    self.eos_token = '</s>'
    self.max_seq_len = max_seq_len
    self.tokenizer = PreTrainedTokenizerFast.from_pretrained(
        "skt/kogpt2-base-v2",
        bos_token=self.bos_token,
        eos_token=self.eos_token,
        unk_token='<unk>',
        pad_token='<pad>',
        mask_token='<mask>')
def test_async_share_tokenizer(self):
    # See https://github.com/huggingface/transformers/pull/12550
    # and https://github.com/huggingface/tokenizers/issues/537
    tokenizer = PreTrainedTokenizerFast.from_pretrained("robot-test/dummy-tokenizer-wordlevel")
    text = "The Matrix is a 1999 science fiction action film."

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(self.fetch, tokenizer, text) for i in range(10)
        ]
        return_value = [future.result() for future in futures]
    self.assertEqual(return_value, [[1, 10, 0, 8, 0, 18, 0, 0, 0, 2] for i in range(10)])
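# The fetch helper submitted to the executor above is not shown; a minimal sketch,
# assuming it simply encodes the shared text from each worker thread (this mirrors
# the upstream transformers test):
def fetch(self, tokenizer, text):
    # Each thread encodes the same string with the shared fast (Rust-backed) tokenizer.
    return tokenizer.encode(text, truncation="longest_first", padding="longest")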
def __init__(self, type="normal", device="cpu"):
    """
    Constructor of Summarizers

    Args:
        type (str): type of article (e.g. normal, paper, patent)
        device (str): device for inference (e.g. cpu, cuda)
    """
    type = type.lower()
    model_name_prefix = "hyunwoongko/ctrlsum"

    assert type in ['normal', 'paper', 'patent'], \
        "param `article_type` must be one of ['normal', 'paper', 'patent']"

    if type == "normal":
        model_name = f"{model_name_prefix}-cnndm"
    elif type == "paper":
        model_name = f"{model_name_prefix}-paper"
    elif type == "patent":
        model_name = f"{model_name_prefix}-patent"
    else:
        raise Exception(f"Unknown type: {type}")

    self.device = device
    self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
    self.tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name)
    self._5w1h = [
        "what ", "what's ", "when ", "why ", "who ", "who's ", "where ", "how ",
        "What ", "What's ", "When ", "Why ", "Who ", "Who's ", "Where ", "How ",
    ]
def __init__(self, path, max_ids):
    self.model = load_model(path)
    self.max_ids = max_ids

    U_TKN = '<usr>'
    S_TKN = '<sys>'
    BOS = '</s>'
    EOS = '</s>'
    MASK = '<unused0>'
    SENT = '<unused1>'
    PAD = '<pad>'

    TOKENIZER = PreTrainedTokenizerFast.from_pretrained(
        "skt/kogpt2-base-v2",
        bos_token=BOS,
        eos_token=EOS,
        unk_token='<unk>',
        pad_token=PAD,
        mask_token=MASK)
    self.tok = TOKENIZER
def fine_tuning(MODEL_TYPE, DATA_PATH, BATCH_SIZE, LEARNING_RATE, WARMUP_STEPS,
                OUTPUT_MODEL_PATH, EPOCHS):
    print("=" * 15, "LOAD MODEL", "=" * 15)
    model = GPT2LMHeadModel.from_pretrained(MODEL_TYPE)
    tokenizer = PreTrainedTokenizerFast.from_pretrained(MODEL_TYPE)

    print("=" * 15, "GET DATASET", "=" * 15)
    data_loader = get_data_loader(DATA_PATH, tokenizer, BATCH_SIZE, True)

    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, WARMUP_STEPS, len(data_loader) - WARMUP_STEPS, -1)

    if not os.path.exists(OUTPUT_MODEL_PATH):
        os.mkdir(OUTPUT_MODEL_PATH)

    fine_tuning_runner(model, optimizer, data_loader, scheduler, EPOCHS, OUTPUT_MODEL_PATH)
    model.save_pretrained(OUTPUT_MODEL_PATH)
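# fine_tuning_runner and get_data_loader are not shown. A minimal sketch of the training
# loop, assuming get_data_loader yields batches of input-id tensors and the model is
# trained with the usual causal-LM objective (labels = inputs); this is an illustration,
# not the original implementation.
import torch

def fine_tuning_runner(model, optimizer, data_loader, scheduler, epochs, output_model_path):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.train()

    for epoch in range(epochs):
        total_loss = 0.0
        for input_ids in data_loader:
            input_ids = input_ids.to(device)
            optimizer.zero_grad()
            # Causal LM loss: the model shifts the labels internally.
            outputs = model(input_ids, labels=input_ids)
            outputs.loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += outputs.loss.item()

        print(f"epoch {epoch + 1}/{epochs} - mean loss {total_loss / len(data_loader):.4f}")
        model.save_pretrained(output_model_path)  # checkpoint after every epoch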
def summarizer(input: TextSummerizeInput) -> TextSummerizeOutput:
    """ Summarize texts """
    tokenizer = PreTrainedTokenizerFast.from_pretrained("hyunwoongko/kobart")
    inputs = tokenizer([
        tokenizer.bos_token + input.text_input + tokenizer.eos_token
    ])['input_ids'][0]

    model_url = 'https://train-mxysk1opgrzauh8ifw55-gpt2-train-teachable-ainize.endpoint.dev.ainize.ai/predictions/bart-ko-small-finetune'
    headers = {'Content-Type': 'application/json; charset=utf-8'}
    response = requests.post(url=model_url, headers=headers, json={"text": inputs})

    if response.status_code == 200:
        result = tokenizer.decode(response.json()[0], skip_special_tokens=True)
        return TextSummerizeOutput(output=result)
    else:
        print(f'Failed: {response.text}')
        return TextSummerizeOutput(output='Failed to summarize')
def main():
    # Config
    config = TrainConfig()

    # Logger
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    handler = logging.StreamHandler(sys.stdout)
    formatter = logging.Formatter("[%(asctime)s] %(message)s")
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Data loading
    raw_train_instances = load_data(config.train_file_path)
    raw_dev_instances = load_data(config.dev_file_path)
    logger.info(f"training examples: {len(raw_train_instances)}\t validation examples: {len(raw_dev_instances)}")

    tokenizer = PreTrainedTokenizerFast.from_pretrained(config.pretrained_model_name)
    train_dataset = ChosungTranslatorDataset(raw_train_instances, tokenizer, config.max_seq_len)
    dev_dataset = ChosungTranslatorDataset(raw_dev_instances, tokenizer, config.max_seq_len)

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=config.num_workers,
    )
    dev_dataloader = DataLoader(
        dev_dataset,
        batch_size=config.batch_size,
        num_workers=config.num_workers,
    )

    model = BartForConditionalGeneration.from_pretrained(config.pretrained_model_name)

    # Train
    optimizer = Adam(model.parameters(), lr=config.learning_rate)
    train(config, model, train_dataloader, dev_dataloader, optimizer, logger, device)
def get_kogpt2_tokenizer(model_path=None):
    if not model_path:
        model_path = 'taeminlee/kogpt2'
    tokenizer = PreTrainedTokenizerFast.from_pretrained(model_path)
    return tokenizer
def get_kobart_tokenizer():
    return PreTrainedTokenizerFast.from_pretrained("hyunwoongko/kobart")
nlpbook.set_logger(args)

# %% download corpus
from Korpora import Korpora
Korpora.fetch(
    args.downstream_corpus_name,
    root_dir=args.downstream_corpus_root_dir,
    force_download=args.force_download,
)

# %% prepare tokenizer
from transformers import PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    args.pretrained_model_name,
    eos_token="</s>",
)

# %% create train dataset
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler
from ratsnlp.nlpbook.generation import GenerationDataset, NsmcCorpus
corpus = NsmcCorpus()
train_dataset = GenerationDataset(
    args=args, corpus=corpus, tokenizer=tokenizer, mode="train"
)
# pip install opyrator transformers torch
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel
import torch
from pydantic import BaseModel, Field
import os

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "skt/kogpt2-base-v2",
    bos_token='</s>',
    eos_token='</s>',
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>')
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')


class Input(BaseModel):
    text: str = Field(title='문장을 입력해주세요.', max_length=128)  # title: "Please enter a sentence."
    max_length: int = Field(128, ge=5, le=128)
    repetition_penalty: float = Field(2.0, ge=0.0, le=2.0)


class Output(BaseModel):
    generated_text: str


def generate_text(input: Input) -> Output:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertConfig, PreTrainedTokenizerFast
from head import GlobalPointer, MutiHeadSelection, Biaffine, TxMutihead
import sys
import os

head_type = sys.argv[1]
os.environ["CUDA_VISIBLE_DEVICES"] = sys.argv[2]

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("Using {} device".format(device))

model_path = "../model_set/bert-base-chinese"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_path)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

assert head_type in [
    'GlobalPointer', 'MutiHeadSelection', 'Biaffine', 'TxMutihead'
]

if head_type in ['MutiHeadSelection', 'Biaffine', 'TxMutihead']:
    batch_size = 4
    learning_rate = 1e-5
    abPosition = False
    rePosition = True
else:
    batch_size = 16
    learning_rate = 2e-5
from flask import Flask, request, Response, render_template, jsonify
import requests
import time
import random
import json
import os
from transformers import PreTrainedTokenizerFast

eng_tokenizer = PreTrainedTokenizerFast.from_pretrained("gpt2-large")

# Server & handling settings
app = Flask(__name__, static_url_path='/static')

models = {
    "gpt2-large": "gpt2-large",
    "gpt2-cover-letter": "cover-letter-gpt2",
    "gpt2-story": "gpt2_story",
    "gpt2-reddit": "gpt2_reddit",
    "gpt2-trump": "gpt2_trump",
}

SERVER_URL = os.environ.get('GPT2_SERVER_URL')
AINIZE_STATUS_URL = os.environ.get('AINIZE_STATUS_URL')
API_DEV = os.environ.get('API_DEV')
API_STAGING = os.environ.get('API_STAGING')
API_PROD = os.environ.get('API_PROD')


@app.route("/status", methods=['GET'])
def ainize_status():
    try:
from transformers import PreTrainedTokenizerFast, RobertaForMaskedLM
from preprocessing.evaluator import Evaluator

# Check that PyTorch sees the GPU
USE_GPU = torch.cuda.is_available()
# USE_GPU = False
print(f'USE_GPU={USE_GPU}')

run_path = Path('runs') / 'run_4'
model_path = run_path / 'model'
dataset_path = Path('data') / 'pan_tadeusz'

text_tokenizer = TextTokenizer(dataset_path)
text_tokenizer.load_vocab(dataset_path / 'vocab.json')
tokenizer2 = PreTrainedTokenizerFast.from_pretrained(
    dataset_path / 'my-pretrained-tokenizer-fast2', max_len=128)


# 4. Check that the LM actually trained
def to_gpu(x, *args, **kwargs):
    return x.cuda(*args, **kwargs) if USE_GPU else x


# load trained model
# os.system('tar xzvf PanTadeuszRoBERTa.tgz')
model = RobertaForMaskedLM.from_pretrained(str(model_path))
model = to_gpu(model)
model.device
import os
import sys
import json
import urllib.request

import torch
import torch.nn.functional as F
from tqdm import trange
from transformers import PreTrainedTokenizerFast
from transformers import GPT2LMHeadModel

MODEL_NAME = "skt/kogpt2-base-v2"
MODEL_PATH = "./models/"
SEQ_LEN = 50
TOKENS_DICT = {
    "additional_special_tokens": ["<unused0>", "<unused1>"],
}

tokenizer = PreTrainedTokenizerFast.from_pretrained(MODEL_NAME)
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
tokenizer.add_special_tokens(TOKENS_DICT)

device = torch.device('cpu')
model.load_state_dict(
    torch.load("smithy/models/processed_slogan_final_5epoch_model.pth",
               map_location=device))
model.eval()


def top_k_top_p_filtering(logits,
import torch
from flask import Flask, make_response
from flask_restx import Api, Resource, reqparse
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast

app = Flask(__name__)
api = Api(app)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("MODEL LOADING ...")
MODEL = GPT2LMHeadModel.from_pretrained("./models")
MODEL.to(DEVICE)

print("TOKENIZER LOADING ...")
TOKENIZER = PreTrainedTokenizerFast.from_pretrained("taeminlee/kogpt2")

MODEL.eval()


def generate(text=""):
    if text == "":
        return "error!! :("
    input_ids = text + "</s>"
    tokens = TOKENIZER.encode(input_ids, return_tensors='pt').to(DEVICE)
    min_length = len(tokens)
    output_ids = TOKENIZER.decode(MODEL.generate(tokens,
                                                 do_sample=True,
                                                 max_length=50,
                                                 min_length=min_length,
                                                 top_k=50,
def __init__(self, pretrained):
    # PTTF is the imported alias for PreTrainedTokenizerFast
    self.tokenizer = PTTF.from_pretrained(pretrained, mask_token='[MASK]')
    self.model = AutoModelForMaskedLM.from_pretrained(pretrained)
    self.model.eval()
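# A minimal sketch of how this wrapper might fill a masked token, assuming a predict()
# method is added to the same class; the method name and example usage are hypothetical.
import torch

def predict(self, text: str, top_k: int = 5):
    # Encode a sentence containing [MASK] and return the top-k candidate fills
    # for the first masked position.
    inputs = self.tokenizer(text, return_tensors="pt")
    mask_positions = (inputs["input_ids"][0] == self.tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
    with torch.no_grad():
        logits = self.model(**inputs).logits
    top_ids = logits[0, mask_positions[0]].topk(top_k).indices.tolist()
    return [self.tokenizer.decode([token_id]).strip() for token_id in top_ids]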
parser.add_argument('--bucket', type=str, default='NONE')

logger = logging.getLogger()
logger.setLevel(logging.INFO)

BOS = '<s>'
EOS = '</s>'
MASK = '<mask>'
NEWLINE = '<unused0>'
PAD = '<pad>'

TOKENIZER = PreTrainedTokenizerFast.from_pretrained(
    "skt/kogpt2-base-v2",
    bos_token=BOS,
    eos_token=EOS,
    unk_token='<unk>',
    pad_token=PAD,
    mask_token=MASK)


class CommentDataset(Dataset):
    def __init__(self, comments, max_len=32):
        self._data = comments
        self.bos = BOS
        self.eos = EOS
        self.mask = MASK
        self.pad = PAD
        self.max_len = max_len
        self.tokenizer = TOKENIZER
        temp = []
        for x in self._data: