def __init__(
    self,
    vocab_file,
    tokenizer_path,
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
    **kwargs,
):
    """
    Build the brain-token vocabulary and the underlying BPE tokenizer.

    Args:
        vocab_file (str): vocabulary file where each line's first
            whitespace-separated field is a token; the first three lines
            are skipped (presumably they duplicate the special tokens —
            TODO confirm against the dict format).
        tokenizer_path (str): directory containing ``vocab.json`` and
            ``merges.txt`` for the byte-pair tokenizer.
        bos_token (str): beginning-of-sentence token surface form.
        eos_token (str): end-of-sentence token surface form.
        unk_token (str): unknown-token surface form.
        pad_token (str): padding-token surface form.
        mask_token (str): mask-token surface form (appended last).
        **kwargs: ignored; accepted for interface compatibility.
    """
    super().__init__()
    self.bos_token = bos_token
    self.eos_token = eos_token
    self.unk_token = unk_token
    self.pad_token = pad_token
    self.mask_token = mask_token

    self.tokenizer = CustomTokenizer.from_file(
        vocab_filename=f"{tokenizer_path}/vocab.json",
        merges_filename=f"{tokenizer_path}/merges.txt",
    )

    # Special tokens occupy the first indices of the brain vocabulary.
    brain_tokens = [bos_token, pad_token, eos_token, unk_token]
    # Fix: read the vocab explicitly as UTF-8 so loading does not depend
    # on the platform's default locale encoding (the vocab is non-ASCII).
    # Also stream the file instead of materializing readlines().
    with open(vocab_file, "r", encoding="utf-8") as r:
        for line_no, line in enumerate(r):
            if line_no < 3:
                # Skip the first three entries, as the original code did
                # with the [3:] slice.
                continue
            brain_tokens.append(line.strip().split()[0])
    brain_tokens.append(mask_token)

    # Forward and inverse lookup tables over the same enumeration.
    self.brain_tok2idx = {tok: idx for idx, tok in enumerate(brain_tokens)}
    self.brain_idx2tok = {idx: tok for idx, tok in enumerate(brain_tokens)}
def load(self, device: str):
    """
    Load user-selected task-specific model

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    from pororo.tasks import PororoTokenizationFactory

    def sent_tokenizer(text, lang):
        # Builds and loads a language-specific sentence splitter on every
        # call, then applies it to the given text (same behavior as the
        # original lambda).
        factory = PororoTokenizationFactory(
            task="tokenization",
            lang=lang,
            model=f"sent_{lang}",
        )
        return factory.load(device).predict(text)

    if "multi" in self.config.n_model:
        from fairseq.models.transformer import TransformerModel
        from pororo.tasks.utils.tokenizer import CustomTokenizer

        load_dict = download_or_load(
            f"transformer/{self.config.n_model}",
            self.config.lang,
        )

        model = TransformerModel.from_pretrained(
            model_name_or_path=load_dict.path,
            checkpoint_file=f"{self.config.n_model}.pt",
            data_name_or_path=load_dict.dict_path,
            source_lang=load_dict.src_dict,
            target_lang=load_dict.tgt_dict,
        )
        model = model.eval().to(device)

        tokenizer = CustomTokenizer.from_file(
            vocab_filename=f"{load_dict.src_tok}/vocab.json",
            merges_filename=f"{load_dict.src_tok}/merges.txt",
        )

        # Select the language-token convention matching the checkpoint name.
        if "mtpg" in self.config.n_model:
            langtok_style = "mbart"
        elif "m2m" in self.config.n_model:
            langtok_style = "multilingual"
        else:
            langtok_style = "basic"

        return PororoTransformerTransMulti(
            model,
            self.config,
            tokenizer,
            sent_tokenizer,
            langtok_style,
        )
def load(self, device: str):
    """
    Load user-selected task-specific model

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    if "charbert" in self.config.n_model:
        from pororo.models.brainbert import CharBrainRobertaModel

        model = CharBrainRobertaModel.load_model(
            f"bert/{self.config.n_model}",
            self.config.lang,
        )
        model = model.eval().to(device)
        print(
            "As of now, this beta model tries to correct spacing errors in Korean text."
        )
        return PororoBertSpacing(model, self.config)

    if "transformer" in self.config.n_model:
        from fairseq.models.transformer import TransformerModel
        from pororo.tasks.utils.tokenizer import CustomTokenizer

        load_dict = download_or_load(
            f"transformer/{self.config.n_model}",
            self.config.lang,
        )

        model = TransformerModel.from_pretrained(
            model_name_or_path=load_dict.path,
            checkpoint_file=f"{self.config.n_model}.pt",
            data_name_or_path=load_dict.dict_path,
            source_lang=load_dict.src_dict,
            target_lang=load_dict.tgt_dict,
        )
        model = model.eval().to(device)

        # Character-level variant returns early without a subword tokenizer.
        if "char" in self.config.n_model:
            return PororoTransformerGecChar(model, self.config)

        tokenizer = None
        if load_dict.src_tok:
            tokenizer = CustomTokenizer.from_file(
                vocab_filename=f"{load_dict.src_tok}/vocab.json",
                merges_filename=f"{load_dict.src_tok}/merges.txt",
            )
        return PororoTransformerGec(model, tokenizer, device, self.config)
def __init__(self, args, task, model, tok_path):
    """Initialize the base wrapper and attach a BPE tokenizer loaded from *tok_path*."""
    super().__init__(args, task, model)
    vocab_path = f"{tok_path}/vocab.json"
    merges_path = f"{tok_path}/merges.txt"
    self.bpe = CustomTokenizer.from_file(
        vocab_filename=vocab_path,
        merges_filename=merges_path,
    )
def load(self, device: str):
    """
    Load user-selected task-specific model

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    n_model = self.config.n_model

    if "sent" in n_model:
        import nltk

        # Ensure the punkt sentence-splitter data is available locally.
        try:
            nltk.data.find("tokenizers/punkt")
        except LookupError:
            nltk.download("punkt")

        from nltk.tokenize import sent_tokenize

        return PororoSentTokenizer(sent_tokenize, self.config)

    if n_model == "mecab_ko":
        try:
            import mecab
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install python-mecab-ko with: `pip install python-mecab-ko`"
            )
        return PororoMecabKoTokenizer(mecab.MeCab(), self.config)

    if n_model == "char":
        return PororoCharTokenizer(self.config)

    if n_model == "jamo":
        return PororoJamoTokenizer(self.config)

    if n_model == "word":
        return PororoWordTokenizer(self.config)

    if n_model == "roberta":
        from fairseq.data.encoders.gpt2_bpe import get_encoder

        encoder = download_or_load("misc/encoder.json", self.config.lang)
        vocab = download_or_load("misc/vocab.bpe", self.config.lang)
        model = get_encoder(encoder, vocab)

        # Re-read the encoder JSON to expose token<->id maps alongside the BPE.
        with open(encoder, "r") as f_vocab:
            vocab = json.load(f_vocab)
        inv_dict = {v: k for k, v in vocab.items()}

        return PororoRoBERTaTokenizer(model, vocab, inv_dict, self.config)

    if n_model == "moses":
        try:
            from sacremoses import MosesDetokenizer, MosesTokenizer
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install sacremoses with: `pip install sacremoses`")
        return PororoMosesTokenizer(
            MosesTokenizer(lang="en"),
            MosesDetokenizer(lang="en"),
            self.config,
        )

    if n_model == "jieba":
        try:
            import jieba
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install jieba with: `pip install jieba`")
        return PororoJiebaTokenizer(jieba.cut, self.config)

    if n_model == "mecab":
        try:
            import fugashi
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install fugashi with: `pip install fugashi`")
        try:
            import ipadic
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install ipadic with: `pip install ipadic`")

        dic_dir = ipadic.DICDIR
        mecabrc = os.path.join(dic_dir, "mecabrc")
        # Same option string as the original ("-d <dic> -r <rc> ",
        # trailing space included).
        tagger = fugashi.GenericTagger(f"-d {dic_dir} -r {mecabrc} ")
        return PororoMecabTokenizer(tagger, self.config)

    # Fallback: a downloadable subword tokenizer (BPE or unigram vocab).
    from pororo.tasks.utils.tokenizer import CustomTokenizer

    path = download_or_load(
        f"tokenizers/{n_model}.zip",
        self.config.lang,
    )

    is_unigram = "unigram" in n_model
    model = CustomTokenizer.from_file(
        vocab_filename=f"{path}/vocab.{'txt' if is_unigram else 'json'}",
        merges_filename=None if is_unigram else f"{path}/merges.txt",
        # jpe models skip normalization; all others normalize.
        normalize="jpe" not in n_model,
    )

    if "jpe" in n_model:
        return PororoJamoPairTokenizer(model, self.config)
    if "mecab.bpe" in n_model:
        return PororoMecabSPTokenizer(model, self.config)
    return PororoSPTokenizer(model, self.config)
def load(self, device: str):
    """
    Load user-selected task-specific model

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    if "multi" in self.config.n_model:
        from fairseq.models.transformer import TransformerModel
        from pororo.tasks.utils.tokenizer import CustomTokenizer

        # NOTE(review): the multilingual checkpoint is fetched under the
        # literal "multi" language key, not self.config.lang.
        load_dict = download_or_load(
            f"transformer/{self.config.n_model}",
            "multi",
        )

        model = TransformerModel.from_pretrained(
            model_name_or_path=load_dict.path,
            checkpoint_file=f"{self.config.n_model}.pt",
            data_name_or_path=load_dict.dict_path,
            source_lang=load_dict.src_dict,
            target_lang=load_dict.tgt_dict,
        )
        model = model.eval().to(device)

        tokenizer = CustomTokenizer.from_file(
            vocab_filename=f"{load_dict.src_tok}/vocab.json",
            merges_filename=f"{load_dict.src_tok}/merges.txt",
        )

        return PororoTransformerTransMulti(
            model,
            self.config,
            tokenizer,
        )

    if "transformer" in self.config.n_model:
        from fairseq.models.transformer import TransformerModel

        load_dict = download_or_load(
            f"transformer/{self.config.n_model}",
            self.config.lang,
        )

        model = TransformerModel.from_pretrained(
            model_name_or_path=load_dict.path,
            checkpoint_file=f"{self.config.n_model}.pt",
            data_name_or_path=load_dict.dict_path,
            source_lang=load_dict.src_dict,
            target_lang=load_dict.tgt_dict,
        )
        model = model.eval().to(device)

        # The tokenizer stays None for "zh" — presumably those models need
        # no subword tokenizer; verify against the predictor.
        tokenizer = None
        if self.config.lang != "zh":
            from pororo.tasks.utils.tokenizer import CustomTokenizer

            tokenizer = CustomTokenizer.from_file(
                vocab_filename=f"{load_dict.src_tok}/vocab.json",
                merges_filename=f"{load_dict.src_tok}/merges.txt",
            )

        return PororoTransformerParaphrase(model, self.config, tokenizer)