def load(self, device: str):
    """
    Load user-selected task-specific model

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    if "bert" in self.config.n_model:
        from transformers import RobertaTokenizer

        # Checkpoint directory comes from the pororo download cache.
        ckpt_dir = download_or_load(
            f"bert/{self.config.n_model}",
            self.config.lang,
        )
        model_config = torch.load(f"{ckpt_dir}/config.pt")
        roberta_tok = RobertaTokenizer.from_pretrained("roberta-base")

        encoder = RobertaEncoder.from_pretrained(
            device=device,
            model_path=ckpt_dir,
            tokenizer=roberta_tok,
            config=model_config,
        )
        return PororoBertMovie(encoder.eval().to(device), self.config)
def load(self, device: str):
    """
    Load user-selected task-specific model

    Args:
        device (str): device information
            (fixed: was annotated ``int``, contradicting this docstring and
            every sibling loader; the paradigm models never move to a device,
            but the signature must stay consistent with the factory API)

    Returns:
        object: User-selected task-specific model

    """
    if self.config.n_model == "koparadigm":
        try:
            from koparadigm import Paradigm
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install koparadigm with: `pip install koparadigm`")
        model = Paradigm()
        return PororoKoParadigm(model, self.config)

    if self.config.n_model in ["enparadigm", "japaradigm"]:
        model_path = download_or_load(
            f"misc/inflection.{self.config.lang}.pickle",
            self.config.lang,
        )
        # NOTE: pickle.load on a downloaded artifact is only safe because the
        # file comes from the project's own model registry.
        with open(model_path, "rb") as handle:
            model = dict(pickle.load(handle))
        return PororoParadigm(model, self.config)
def load(self, device: str):
    """
    Load user-selected task-specific model

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    if self.config.n_model == "p2g.zh":
        from pororo.models.p2g import P2gM

        # Chinese P2G needs two vocab lookup tables plus the checkpoint.
        pinyin = download_or_load(
            f"misc/pinyin2idx.{self.config.lang}.pkl",
            self.config.lang,
        )
        char = download_or_load(
            f"misc/char2idx.{self.config.lang}.pkl",
            self.config.lang,
        )
        ckpt = download_or_load(
            f"misc/{self.config.n_model}.pt",
            self.config.lang,
        )
        return PororoP2GZh(P2gM(pinyin, char, ckpt, device), self.config)

    if self.config.n_model == "p2g.ja":
        from fairseq.models.transformer import TransformerModel

        # Japanese P2G is a plain fairseq transformer checkpoint.
        load_dict = download_or_load(
            "transformer/transformer.base.ja.p2g",
            self.config.lang,
        )
        transformer = TransformerModel.from_pretrained(
            model_name_or_path=load_dict.path,
            checkpoint_file="transformer.base.ja.p2g.pt",
            data_name_or_path=load_dict.dict_path,
            source_lang=load_dict.src_dict,
            target_lang=load_dict.tgt_dict,
        )
        return PororoP2GJa(transformer.eval().to(device), self.config)
def load(self, device: str):
    """
    Load user-selected task-specific model

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    # All brainbert-family checkpoints share one load/wrap recipe; pick the
    # encoder class by the substring embedded in the model name.  SBERT is
    # the odd one out and is handled inline.  Match order mirrors the
    # original priority (brainbert > jaberta > zhberta > sbert > roberta).
    encoder_cls = None
    if "brainbert" in self.config.n_model:
        from pororo.models.brainbert import BrainRobertaModel
        encoder_cls = BrainRobertaModel
    elif "jaberta" in self.config.n_model:
        from pororo.models.brainbert import JabertaModel
        encoder_cls = JabertaModel
    elif "zhberta" in self.config.n_model:
        from pororo.models.brainbert import ZhbertaModel
        encoder_cls = ZhbertaModel
    elif "sbert" in self.config.n_model:
        from sentence_transformers import SentenceTransformer

        path = download_or_load(
            f"sbert/{self.config.n_model}",
            self.config.lang,
        )
        sbert = SentenceTransformer(path).eval().to(device)
        return PororoSBertSts(sbert, self.config)
    elif "roberta" in self.config.n_model:
        from pororo.models.brainbert import CustomRobertaModel
        encoder_cls = CustomRobertaModel

    if encoder_cls is not None:
        encoder = encoder_cls.load_model(
            f"bert/{self.config.n_model}",
            self.config.lang,
        ).eval().to(device)
        return PororoBertSts(encoder, self.config)
def load(self, device: str):
    """
    Load user-selected task-specific model

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    from pororo.tasks import PororoTokenizationFactory

    def sent_tokenizer(text, lang):
        # Builds a per-language sentence splitter on demand.
        return PororoTokenizationFactory(
            task="tokenization",
            lang=lang,
            model=f"sent_{lang}",
        ).load(device).predict(text)

    if "multi" in self.config.n_model:
        from fairseq.models.transformer import TransformerModel

        from pororo.tasks.utils.tokenizer import CustomTokenizer

        load_dict = download_or_load(
            f"transformer/{self.config.n_model}",
            self.config.lang,
        )
        transformer = TransformerModel.from_pretrained(
            model_name_or_path=load_dict.path,
            checkpoint_file=f"{self.config.n_model}.pt",
            data_name_or_path=load_dict.dict_path,
            source_lang=load_dict.src_dict,
            target_lang=load_dict.tgt_dict,
        ).eval().to(device)

        tokenizer = CustomTokenizer.from_file(
            vocab_filename=f"{load_dict.src_tok}/vocab.json",
            merges_filename=f"{load_dict.src_tok}/merges.txt",
        )

        # Language-token convention differs by checkpoint family.
        if "mtpg" in self.config.n_model:
            langtok_style = "mbart"
        elif "m2m" in self.config.n_model:
            langtok_style = "multilingual"
        else:
            langtok_style = "basic"

        return PororoTransformerTransMulti(
            transformer,
            self.config,
            tokenizer,
            sent_tokenizer,
            langtok_style,
        )
def load(self, device: str):
    """
    Load user-selected task-specific model

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    if "charbert" in self.config.n_model:
        from pororo.models.brainbert import CharBrainRobertaModel

        encoder = CharBrainRobertaModel.load_model(
            f"bert/{self.config.n_model}",
            self.config.lang,
        ).eval().to(device)
        print(
            "As of now, this beta model tries to correct spacing errors in Korean text."
        )
        return PororoBertSpacing(encoder, self.config)

    if "transformer" in self.config.n_model:
        from fairseq.models.transformer import TransformerModel

        from pororo.tasks.utils.tokenizer import CustomTokenizer

        load_dict = download_or_load(
            f"transformer/{self.config.n_model}",
            self.config.lang,
        )
        transformer = TransformerModel.from_pretrained(
            model_name_or_path=load_dict.path,
            checkpoint_file=f"{self.config.n_model}.pt",
            data_name_or_path=load_dict.dict_path,
            source_lang=load_dict.src_dict,
            target_lang=load_dict.tgt_dict,
        ).eval().to(device)

        # Character-level GEC variants run without a subword tokenizer.
        if "char" in self.config.n_model:
            return PororoTransformerGecChar(transformer, self.config)

        tokenizer = None
        if load_dict.src_tok:
            tokenizer = CustomTokenizer.from_file(
                vocab_filename=f"{load_dict.src_tok}/vocab.json",
                merges_filename=f"{load_dict.src_tok}/merges.txt",
            )
        return PororoTransformerGec(transformer, tokenizer, device, self.config)
def load(self, device: str):
    """
    Load user-selected task-specific model

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    from sentence_transformers import SentenceTransformer

    # English models are resolved by sentence-transformers name directly;
    # other languages are served from the pororo download cache.
    model_path = self.config.n_model
    if self.config.lang != "en":
        model_path = download_or_load(
            f"sbert/{self.config.n_model}",
            self.config.lang,
        )
    model = SentenceTransformer(model_path).eval().to(device)
    return PororoSBertSentence(model, self.config)
def load(self, device: str):
    """
    Load user-selected task-specific model

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    if "transformer" in self.config.n_model:
        from fairseq.models.transformer import TransformerModel

        from pororo.tasks import PororoPosFactory

        load_dict = download_or_load(
            f"transformer/{self.config.n_model}",
            self.config.lang,
        )
        parser = TransformerModel.from_pretrained(
            model_name_or_path=load_dict.path,
            checkpoint_file=f"{self.config.n_model}.pt",
            data_name_or_path=load_dict.dict_path,
            source_lang=load_dict.src_dict,
            target_lang=load_dict.tgt_dict,
        ).eval().to(device)

        # POS tagger backend and result wrapper vary per language.
        dispatch = {
            "ko": ("mecab-ko", PororoTransConstKo),
            "en": ("nltk", PororoTransConstEn),
            "zh": ("jieba", PororoTransConstZh),
        }
        entry = dispatch.get(self.config.lang)
        if entry is not None:
            tagger_name, wrapper_cls = entry
            tagger = PororoPosFactory(
                task="pos",
                model=tagger_name,
                lang=self.config.lang,
            ).load(device)
            return wrapper_cls(parser, tagger, self.config)
def load_model(cls, model_name: str, lang: str, **kwargs):
    """
    Load pre-trained model as RobertaHubInterface.

    :param model_name: model name from available_models
    :return: pre-trained model
    """
    from fairseq import hub_utils

    # Fetch (or reuse cached) checkpoint and the matching BPE tokenizer.
    ckpt_dir = download_or_load(model_name, lang)
    tok_path = download_or_load(f"tokenizers/bpe32k.{lang}.zip", lang)

    bundle = hub_utils.from_pretrained(
        ckpt_dir,
        "model.pt",
        ckpt_dir,
        load_checkpoint_heads=True,
        **kwargs,
    )
    return BrainRobertaHubInterface(
        bundle["args"],
        bundle["task"],
        bundle["models"][0],
        tok_path,
    )
def load_model(cls, model_name: str, lang: str, **kwargs):
    """
    Load pre-trained model as RobertaHubInterface.

    :param model_name: model name from available_models
    :return: pre-trained model
    """
    from fairseq import hub_utils

    ckpt_dir = download_or_load(model_name, lang)
    bundle = hub_utils.from_pretrained(
        ckpt_dir,
        "model.pt",
        bpe="gpt2",  # this variant relies on fairseq's GPT-2 BPE codec
        load_checkpoint_heads=True,
        **kwargs,
    )
    return CustomRobertaHubInterface(
        bundle["args"],
        bundle["task"],
        bundle["models"][0],
    )
def load(self, device: str):
    """
    Load user-selected task-specific model

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    from sentence_transformers import SentenceTransformer

    # Non-English checkpoints are resolved through the pororo cache;
    # English names are passed to sentence-transformers verbatim.
    model_path = (self.config.n_model if self.config.lang == "en" else
                  download_or_load(
                      f"sbert/{self.config.n_model}",
                      self.config.lang,
                  ))
    encoder = SentenceTransformer(model_path)
    return PororoSBertSentence(encoder.eval().to(device), self.config)
def load_model(cls, model_name: str, lang: str, **kwargs):
    """
    Load pre-trained model as RobertaHubInterface.

    :param model_name: model name from available_models
    :return: pre-trained model
    """
    from fairseq import hub_utils

    # cache directory is treated as the home directory for both model and data files
    ckpt_dir = download_or_load(model_name, lang)
    bundle = hub_utils.from_pretrained(
        ckpt_dir,
        "model.pt",
        ckpt_dir,
        load_checkpoint_heads=True,
        **kwargs,
    )
    return SegmentBertHubInterface(
        bundle["args"],
        bundle["task"],
        bundle["models"][0],
        lang,
    )
def __init__(self, args, task, model):
    # Wire the GPT-2 BPE assets (fetched once, then cached) into the
    # fairseq args before handing off to the parent interface.
    args.gpt2_vocab_bpe = download_or_load("misc/vocab.bpe", "en")
    args.gpt2_encoder_json = download_or_load("misc/encoder.json", "en")
    super().__init__(args, task, model)
    # Softmax over the class dimension for downstream probability output.
    self.softmax = nn.Softmax(dim=1)
def load(self, device: str):
    """
    Load user-selected task-specific model

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    if "bart" in self.config.n_model:
        from whoosh import index

        from pororo.models.bart.KoBART import KoBartModel
        from pororo.models.wikipedia2vec import Wikipedia2Vec
        from pororo.tasks import PororoTokenizationFactory

        # Per-language wikipedia2vec embedding dumps.
        vec_map = {
            "ko": "kowiki_20200720_100d.pkl",
            "en": "enwiki_20180420_100d.pkl",
            "ja": "jawiki_20180420_100d.pkl",
            "zh": "zhwiki_20180420_100d.pkl",
        }

        f_wikipedia2vec = download_or_load(
            f"misc/{vec_map[self.config.lang]}",
            self.config.lang,
        )
        f_index = download_or_load(
            f"misc/{self.config.lang}_indexdir.zip",
            self.config.lang,
        )

        # Similar-word search backed by embeddings plus a whoosh index.
        wiki2vec = Wikipedia2Vec(model_file=f_wikipedia2vec, device=device)
        sim_words = SimilarWords(wiki2vec, index.open_dir(f_index))

        model_path = download_or_load(
            f"bart/{self.config.n_model}",
            self.config.lang,
        )
        bart = KoBartModel.from_pretrained(
            device=device,
            model_path=model_path,
        )

        def sent_tok(text):
            # Sentence splitter is instantiated lazily per call.
            return PororoTokenizationFactory(
                task="tokenization",
                lang=self.config.lang,
                model=f"sent_{self.config.lang}",
            ).load(device).predict(text)

        return PororoKoBartQuestionGeneration(
            bart,
            sim_words,
            sent_tok,
            self.config,
        )
def load(self, device: str):
    """
    Load user-selected task-specific model

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    if "sent" in self.config.n_model:
        import nltk

        # Ensure the punkt sentence model is present before first use.
        try:
            nltk.data.find("tokenizers/punkt")
        except LookupError:
            nltk.download("punkt")

        from nltk.tokenize import sent_tokenize
        return PororoSentTokenizer(sent_tokenize, self.config)

    if self.config.n_model == "mecab_ko":
        try:
            import mecab
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install python-mecab-ko with: `pip install python-mecab-ko`"
            )
        return PororoMecabKoTokenizer(mecab.MeCab(), self.config)

    if self.config.n_model == "char":
        return PororoCharTokenizer(self.config)

    if self.config.n_model == "jamo":
        return PororoJamoTokenizer(self.config)

    if self.config.n_model == "word":
        return PororoWordTokenizer(self.config)

    if self.config.n_model == "roberta":
        from fairseq.data.encoders.gpt2_bpe import get_encoder

        encoder = download_or_load("misc/encoder.json", self.config.lang)
        vocab = download_or_load("misc/vocab.bpe", self.config.lang)
        model = get_encoder(encoder, vocab)

        # Build token->id and id->token maps from the encoder json.
        with open(encoder, "r") as f_vocab:
            vocab = json.load(f_vocab)
        inv_dict = {v: k for k, v in vocab.items()}
        return PororoRoBERTaTokenizer(model, vocab, inv_dict, self.config)

    if self.config.n_model == "moses":
        try:
            from sacremoses import MosesDetokenizer, MosesTokenizer
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install sacremoses with: `pip install sacremoses`")
        return PororoMosesTokenizer(
            MosesTokenizer(lang="en"),
            MosesDetokenizer(lang="en"),
            self.config,
        )

    if self.config.n_model == "jieba":
        try:
            import jieba
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install jieba with: `pip install jieba`")
        return PororoJiebaTokenizer(jieba.cut, self.config)

    if self.config.n_model == "mecab":
        try:
            import fugashi
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install fugashi with: `pip install fugashi`")
        try:
            import ipadic
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install ipadic with: `pip install ipadic`")

        dic_dir = ipadic.DICDIR
        mecabrc = os.path.join(dic_dir, "mecabrc")
        mecab_option = "-d {} -r {} ".format(
            dic_dir,
            mecabrc,
        )
        return PororoMecabTokenizer(
            fugashi.GenericTagger(mecab_option),
            self.config,
        )

    # Any remaining name is a subword tokenizer shipped as a vocab archive.
    from pororo.tasks.utils.tokenizer import CustomTokenizer

    path = download_or_load(
        f"tokenizers/{self.config.n_model}.zip",
        self.config.lang,
    )

    # Unigram models ship a txt vocab and no merges file.
    is_unigram = "unigram" in self.config.n_model
    ext = "txt" if is_unigram else "json"
    merges_filename = None if is_unigram else f"{path}/merges.txt"

    model = CustomTokenizer.from_file(
        vocab_filename=f"{path}/vocab.{ext}",
        merges_filename=merges_filename,
        normalize="jpe" not in self.config.n_model,
    )
    if "jpe" in self.config.n_model:
        return PororoJamoPairTokenizer(model, self.config)
    if "mecab.bpe" in self.config.n_model:
        return PororoMecabSPTokenizer(model, self.config)
    return PororoSPTokenizer(model, self.config)
from fairseq import hub_utils
from fairseq.models.roberta import RobertaHubInterface, RobertaModel
import mecab
import torch

from pororo.models.brainbert.BrainRoBERTa import BrainRobertaHubInterface
from pororo.tasks.machine_reading_comprehension import PororoBertMrc
from pororo.tasks.utils.base import TaskConfig
from pororo.tasks.utils.download_utils import download_or_load
from pororo.tasks.utils.tokenizer import CustomTokenizer
from pororo.utils import postprocess_span

# Fetch (or reuse cached) checkpoint and tokenizer assets.
ckpt_dir = download_or_load("bert/brainbert.base.ko.korquad", "ko")
# Fixed: this was an f-string with no placeholders.
tok_path = download_or_load("tokenizers/bpe32k.ko.zip", "ko")

x = hub_utils.from_pretrained(
    ckpt_dir,
    "model.pt",
    ckpt_dir,
    load_checkpoint_heads=True,
)

# Fixed: fall back to CPU when CUDA is unavailable instead of crashing
# with a hard-coded torch.device("cuda").
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BrainRobertaHubInterface(
    x["args"],
    x["task"],
    x["models"][0],
    tok_path,
).to(device)

tagger = mecab.MeCab()
final = PororoBertMrc(
    model,
    tagger,
    postprocess_span,
    TaskConfig("mrc", "ko", "brainbert.base.ko.korquad"),
)
print(final("이름이 뭐야?", "이름은 시리야."))
def _apply_wsd(self, tags: List[Tuple[str, str]]):
    """
    Apply Word Sense Disambiguation to get detail tag info

    Args:
        tags (List[Tuple[str, str]]): inference word-tag pair result

    Returns:
        List[Tuple[str, str]]: wsd-applied result

    """
    # Lazily load the WSD model on first call.
    if self._wsd is None:
        from pororo.tasks import PororoWsdFactory
        self._wsd = PororoWsdFactory(
            task="wsd",
            lang="ko",
            model="transformer.large.ko.wsd",
        ).load(self._device)

    # Lazily build classifier ("morph__NNB__homonymno") -> category table.
    if self._cls2cat is None:
        self._cls2cat = dict()
        lines = (open(
            download_or_load(
                "misc/wsd.cls.txt",
                self.config.lang,
            ),
            "r",
            encoding="utf8",
        ).read().strip().splitlines())
        for line in lines:
            morph, homonymno, category = line.split()
            classifier = f"{morph}__NNB__{homonymno}"  # bound noun
            self._cls2cat[classifier] = category

    # Lazily build regex-template tables for QUANTITY and TERM refinement.
    if self._quant2cat is None:
        self._quant2cat = dict()
        self._term2cat = dict()
        lines = (open(
            download_or_load(
                "misc/re.templates.txt",
                self.config.lang,
            ),
            "r",
        ).read().strip().splitlines())
        for line in lines:
            category, ner_category, expression = line.split(" ", 2)
            if ner_category == "QUANTITY":
                self._quant2cat[expression] = category
            elif ner_category == "TERM":
                self._term2cat[expression] = category

    # Build the WSD input text, wrapping tokens that still need
    # disambiguation in {...} markers and remembering their indices.
    input_text_with_markers = str()
    target_token_ids = []
    for idx, ner_token in enumerate(tags):
        surface, tag = ner_token
        # as {} will be used as special symbols
        # NOTE(review): both replace() calls below substitute a character
        # with itself and are therefore no-ops here; escaping to different
        # (e.g. fullwidth) braces may have been intended — confirm upstream.
        surface = surface.replace("{", "{")
        surface = surface.replace("}", "}")
        if tag == "TERM":
            # TERM tokens are refined in place via regex templates.
            cat = self._template_match(surface, self._term2cat)
            if cat is not None:
                tags[idx] = (surface, cat)
            input_text_with_markers += surface
        elif tag == "QUANTITY":
            cat = self._template_match(surface, self._quant2cat)
            if cat is not None:
                tags[idx] = (surface, cat)
                input_text_with_markers += surface
            else:
                # Unresolved quantity: defer to the WSD pass below.
                target_token_ids.append(idx)
                input_text_with_markers += "{" + surface + "}"
        else:
            input_text_with_markers += surface

    wsd_results = self._wsd(input_text_with_markers)

    # Scan WSD output; inside each {...} span, the first morph whose
    # classifier is known determines the category.  A span that closes
    # without a match falls back to QUANTITY.
    action = False
    has_category = False
    categories = []
    for wsd_token in wsd_results:
        morph, tag, homonymno = wsd_token[:3]
        if morph == "{":
            has_category = False
            action = True
        elif morph == "}":
            if has_category is False:
                categories.append("QUANTITY")  # original category
            has_category = False
            action = False
        if action:
            if homonymno is None:
                homonymno = "00"
            query = f"{morph}__{tag}__{homonymno}"
            if query in self._cls2cat:
                category = self._cls2cat[query]
                categories.append(category)
                has_category = True
                action = False

    # One resolved category per deferred token, in order.
    assert len(target_token_ids) == len(categories)

    for target_token_id, cat in zip(target_token_ids, categories):
        tags[target_token_id] = (tags[target_token_id][0], cat)

    return tags
def load(self, device: str):
    """
    Load user-selected task-specific model

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    translator = None
    if "transformer" in self.config.n_model:
        from transformers import BertTokenizer

        from pororo.models.caption import Caption, Detr

        load_dict = download_or_load(
            f"transformer/{self.config.n_model}",
            "en",
        )

        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

        # Caption head is sized to the BERT vocabulary.
        transformer = Caption(tokenizer.pad_token_id, tokenizer.vocab_size)
        state = torch.load(
            os.path.join(
                load_dict.path,
                f"{self.config.n_model}.pt",
            ),
            map_location=device,
        )
        transformer.load_state_dict(state["model"])
        transformer.eval().to(device)

        detr = Detr(device)

        if self.config.lang != "en":
            assert self.config.lang in [
                "ko",
                "ja",
                "zh",
            ], "Unsupported language code is selected!"
            from pororo.tasks import PororoTranslationFactory

            # Captions come out in English; translate them on the fly.
            translator = PororoTranslationFactory(
                task="mt",
                lang="multi",
                model="transformer.large.multi.mtpg",
            ).load(device)

        return PororoCaptionBrainCaption(
            detr,
            transformer,
            tokenizer,
            translator,
            device,
            self.config,
        )
def load(self, device: str):
    """
    Load user-selected task-specific model

    Args:
        device (str): device information
            (annotation added for consistency with the other factory loaders)

    Returns:
        object: User-selected task-specific model

    """
    if "roberta" in self.config.n_model:
        from pororo.models.brainbert import CustomRobertaModel

        model = (CustomRobertaModel.load_model(
            f"bert/{self.config.n_model}",
            self.config.lang,
        ).eval().to(device))
        return PororoBertNerEn(model, self.config)

    if "charbert" in self.config.n_model:
        from pororo.models.brainbert import CharBrainRobertaModel
        from pororo.tasks.tokenization import PororoTokenizationFactory

        model = (CharBrainRobertaModel.load_model(
            f"bert/{self.config.n_model}",
            self.config.lang,
        ).eval().to(device))
        sent_tokenizer = PororoTokenizationFactory(
            task="tokenization",
            model="sent_ko",
            lang=self.config.lang,
        ).load(device)

        # Disambiguation dictionary: origin -> {word: target}.
        # Fixed: the file handle was previously never closed.
        wsd_dict = defaultdict(dict)
        with open(
                download_or_load(
                    f"misc/wiki.{self.config.lang}.items",
                    self.config.lang,
                ),
                "r",
        ) as f_wsd_dict:
            for line in f_wsd_dict:
                origin, target, word = line.strip().split("\t")
                wsd_dict[origin][word] = target

        return PororoBertCharNer(
            model,
            sent_tokenizer,
            wsd_dict,
            device,
            self.config,
        )

    if "zhberta" in self.config.n_model:
        from pororo.models.brainbert import ZhbertaModel

        model = (ZhbertaModel.load_model(
            f"bert/{self.config.n_model}",
            self.config.lang,
        ).eval().to(device))
        return PororoBertNerZh(model, self.config)

    if "jaberta" in self.config.n_model:
        from pororo.models.brainbert import JabertaModel

        model = (JabertaModel.load_model(
            f"bert/{self.config.n_model}",
            self.config.lang,
        ).eval().to(device))
        return PororoBertNerJa(model, self.config)
def load(self, device: str):
    """
    Load user-selected task-specific model

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    if "multi" in self.config.n_model:
        from fairseq.models.transformer import TransformerModel

        from pororo.tasks.utils.tokenizer import CustomTokenizer

        load_dict = download_or_load(
            f"transformer/{self.config.n_model}",
            "multi",
        )
        transformer = TransformerModel.from_pretrained(
            model_name_or_path=load_dict.path,
            checkpoint_file=f"{self.config.n_model}.pt",
            data_name_or_path=load_dict.dict_path,
            source_lang=load_dict.src_dict,
            target_lang=load_dict.tgt_dict,
        ).eval().to(device)
        tokenizer = CustomTokenizer.from_file(
            vocab_filename=f"{load_dict.src_tok}/vocab.json",
            merges_filename=f"{load_dict.src_tok}/merges.txt",
        )
        return PororoTransformerTransMulti(
            transformer,
            self.config,
            tokenizer,
        )

    if "transformer" in self.config.n_model:
        from fairseq.models.transformer import TransformerModel

        load_dict = download_or_load(
            f"transformer/{self.config.n_model}",
            self.config.lang,
        )
        transformer = TransformerModel.from_pretrained(
            model_name_or_path=load_dict.path,
            checkpoint_file=f"{self.config.n_model}.pt",
            data_name_or_path=load_dict.dict_path,
            source_lang=load_dict.src_dict,
            target_lang=load_dict.tgt_dict,
        ).eval().to(device)

        # Chinese paraphrase models run without a subword tokenizer.
        tokenizer = None
        if self.config.lang != "zh":
            from pororo.tasks.utils.tokenizer import CustomTokenizer
            tokenizer = CustomTokenizer.from_file(
                vocab_filename=f"{load_dict.src_tok}/vocab.json",
                merges_filename=f"{load_dict.src_tok}/merges.txt",
            )
        return PororoTransformerParaphrase(transformer, self.config, tokenizer)