Пример #1
0
    def __init__(
        self,
        vocab_file,
        tokenizer_path,
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        **kwargs,
    ):
        """
        Build the tokenizer and the token <-> index lookup tables

        Args:
            vocab_file (str): path of a text vocabulary file; the first
                whitespace-separated field of every line after the third
                is used as a token
            tokenizer_path (str): directory containing `vocab.json` and
                `merges.txt` for `CustomTokenizer.from_file`
            bos_token (str): special token stored at index 0
            eos_token (str): special token stored at index 2
            unk_token (str): special token stored at index 3
            pad_token (str): special token stored at index 1
            mask_token (str): special token stored at the last index
            **kwargs: accepted but not used by this constructor

        """
        super().__init__()
        self.bos_token = bos_token
        self.eos_token = eos_token
        self.unk_token = unk_token
        self.pad_token = pad_token
        self.mask_token = mask_token

        self.tokenizer = CustomTokenizer.from_file(
            vocab_filename=f"{tokenizer_path}/vocab.json",
            merges_filename=f"{tokenizer_path}/merges.txt",
        )

        # Index layout: the four special tokens, then the vocabulary file
        # entries, then the mask token.
        brain_tokens = [bos_token, pad_token, eos_token, unk_token]

        # Decode explicitly as UTF-8 so the vocabulary is read the same way on
        # every platform instead of depending on the locale's default encoding.
        with open(vocab_file, "r", encoding="utf-8") as r:
            vocab_lines = r.readlines()

        # The first three lines of the vocabulary file are skipped.
        brain_tokens.extend(line.split()[0] for line in vocab_lines[3:])
        brain_tokens.append(mask_token)

        self.brain_tok2idx = {tok: idx for idx, tok in enumerate(brain_tokens)}
        self.brain_idx2tok = dict(enumerate(brain_tokens))
Пример #2
0
    def load(self, device: str):
        """
        Load user-selected task-specific model

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model, or None when the
                configured model name does not contain "multi"

        """
        from pororo.tasks import PororoTokenizationFactory

        # Loaded sentence tokenizers keyed by language, so that each one is
        # built and loaded once instead of on every call.
        sent_tokenizers = {}

        def sent_tokenizer(text, lang):
            """Run the `sent_{lang}` tokenization model on `text`."""
            if lang not in sent_tokenizers:
                sent_tokenizers[lang] = PororoTokenizationFactory(
                    task="tokenization",
                    lang=lang,
                    model=f"sent_{lang}",
                ).load(device)
            return sent_tokenizers[lang].predict(text)

        if "multi" in self.config.n_model:
            from fairseq.models.transformer import TransformerModel

            from pororo.tasks.utils.tokenizer import CustomTokenizer

            load_dict = download_or_load(
                f"transformer/{self.config.n_model}",
                self.config.lang,
            )

            model = (TransformerModel.from_pretrained(
                model_name_or_path=load_dict.path,
                checkpoint_file=f"{self.config.n_model}.pt",
                data_name_or_path=load_dict.dict_path,
                source_lang=load_dict.src_dict,
                target_lang=load_dict.tgt_dict,
            ).eval().to(device))

            tokenizer = CustomTokenizer.from_file(
                vocab_filename=f"{load_dict.src_tok}/vocab.json",
                merges_filename=f"{load_dict.src_tok}/merges.txt",
            )

            # The language-token style is chosen from the model name.
            if "mtpg" in self.config.n_model:
                langtok_style = "mbart"
            elif "m2m" in self.config.n_model:
                langtok_style = "multilingual"
            else:
                langtok_style = "basic"

            return PororoTransformerTransMulti(
                model,
                self.config,
                tokenizer,
                sent_tokenizer,
                langtok_style,
            )

        # No other model family is handled by this loader.
        return None
    def load(self, device: str):
        """
        Load the model selected by the configuration

        Args:
            device (str): device the loaded model is moved to

        Returns:
            object: PororoBertSpacing when the model name contains "charbert";
                PororoTransformerGecChar or PororoTransformerGec when it
                contains "transformer"; None otherwise

        """
        model_name = self.config.n_model

        if "charbert" in model_name:
            from pororo.models.brainbert import CharBrainRobertaModel

            spacing_model = CharBrainRobertaModel.load_model(
                f"bert/{model_name}",
                self.config.lang,
            )
            spacing_model = spacing_model.eval().to(device)
            print(
                "As of now, this beta model tries to correct spacing errors in Korean text."
            )
            return PororoBertSpacing(spacing_model, self.config)

        # Nothing else to build unless a transformer model was requested.
        if "transformer" not in model_name:
            return None

        from fairseq.models.transformer import TransformerModel

        from pororo.tasks.utils.tokenizer import CustomTokenizer

        ckpt = download_or_load(f"transformer/{model_name}", self.config.lang)

        gec_model = TransformerModel.from_pretrained(
            model_name_or_path=ckpt.path,
            checkpoint_file=f"{model_name}.pt",
            data_name_or_path=ckpt.dict_path,
            source_lang=ckpt.src_dict,
            target_lang=ckpt.tgt_dict,
        )
        gec_model = gec_model.eval().to(device)

        # Models with "char" in their name are wrapped without a tokenizer.
        if "char" in model_name:
            return PororoTransformerGecChar(gec_model, self.config)

        # A tokenizer is loaded only when the download exposes a source
        # tokenizer path; otherwise None is handed to the wrapper.
        if ckpt.src_tok:
            tokenizer = CustomTokenizer.from_file(
                vocab_filename=f"{ckpt.src_tok}/vocab.json",
                merges_filename=f"{ckpt.src_tok}/merges.txt",
            )
        else:
            tokenizer = None

        return PororoTransformerGec(gec_model, tokenizer, device, self.config)
Пример #4
0
 def __init__(self, args, task, model, tok_path):
     """
     Initialise the parent class and load the tokenizer kept in `self.bpe`

     Args:
         args: forwarded unchanged to the parent constructor
         task: forwarded unchanged to the parent constructor
         model: forwarded unchanged to the parent constructor
         tok_path (str): directory containing `vocab.json` and `merges.txt`

     """
     super().__init__(args, task, model)
     vocab_path = f"{tok_path}/vocab.json"
     merges_path = f"{tok_path}/merges.txt"
     self.bpe = CustomTokenizer.from_file(
         vocab_filename=vocab_path,
         merges_filename=merges_path,
     )
Пример #5
0
    def load(self, device: str):
        """
        Load user-selected task-specific model

        Args:
            device (str): device information (not used by any tokenizer
                built in this method)

        Returns:
            object: User-selected task-specific model

        Raises:
            ModuleNotFoundError: if the optional package required by the
                selected tokenizer (python-mecab-ko, sacremoses, jieba,
                fugashi or ipadic) is not installed

        """
        n_model = self.config.n_model

        if "sent" in n_model:
            import nltk

            # Download the punkt data only when it cannot be found locally.
            try:
                nltk.data.find("tokenizers/punkt")
            except LookupError:
                nltk.download("punkt")

            from nltk.tokenize import sent_tokenize

            return PororoSentTokenizer(sent_tokenize, self.config)

        if n_model == "mecab_ko":
            try:
                import mecab
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install python-mecab-ko with: `pip install python-mecab-ko`"
                ) from error
            model = mecab.MeCab()
            return PororoMecabKoTokenizer(model, self.config)

        if n_model == "char":
            return PororoCharTokenizer(self.config)

        if n_model == "jamo":
            return PororoJamoTokenizer(self.config)

        if n_model == "word":
            return PororoWordTokenizer(self.config)

        if n_model == "roberta":
            from fairseq.data.encoders.gpt2_bpe import get_encoder

            encoder_path = download_or_load("misc/encoder.json",
                                            self.config.lang)
            bpe_path = download_or_load("misc/vocab.bpe", self.config.lang)
            model = get_encoder(encoder_path, bpe_path)

            # JSON text is read as UTF-8 explicitly rather than with the
            # locale-dependent default encoding.
            with open(encoder_path, "r", encoding="utf-8") as f_vocab:
                vocab = json.load(f_vocab)
            inv_dict = {v: k for k, v in vocab.items()}

            return PororoRoBERTaTokenizer(model, vocab, inv_dict, self.config)

        if n_model == "moses":
            try:
                from sacremoses import MosesDetokenizer, MosesTokenizer
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install sacremoses with: `pip install sacremoses`"
                ) from error
            model = MosesTokenizer(lang="en")
            detok = MosesDetokenizer(lang="en")
            return PororoMosesTokenizer(model, detok, self.config)

        if n_model == "jieba":
            try:
                import jieba
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install jieba with: `pip install jieba`"
                ) from error
            model = jieba.cut
            return PororoJiebaTokenizer(model, self.config)

        if n_model == "mecab":
            try:
                import fugashi
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install fugashi with: `pip install fugashi`"
                ) from error

            try:
                import ipadic
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install ipadic with: `pip install ipadic`"
                ) from error

            dic_dir = ipadic.DICDIR
            mecabrc = os.path.join(dic_dir, "mecabrc")
            # The trailing space is part of the option string.
            mecab_option = f"-d {dic_dir} -r {mecabrc} "
            model = fugashi.GenericTagger(mecab_option)
            return PororoMecabTokenizer(model, self.config)

        # Every other model name is treated as a downloadable tokenizer archive.
        from pororo.tasks.utils.tokenizer import CustomTokenizer

        path = download_or_load(
            f"tokenizers/{n_model}.zip",
            self.config.lang,
        )

        is_unigram = "unigram" in n_model
        is_jpe = "jpe" in n_model

        # Unigram models use a `vocab.txt` and no merges file; all others use
        # `vocab.json` together with `merges.txt`.
        ext = "txt" if is_unigram else "json"
        merges_filename = None if is_unigram else f"{path}/merges.txt"

        model = CustomTokenizer.from_file(
            vocab_filename=f"{path}/vocab.{ext}",
            merges_filename=merges_filename,
            normalize=not is_jpe,
        )
        if is_jpe:
            return PororoJamoPairTokenizer(model, self.config)
        if "mecab.bpe" in n_model:
            return PororoMecabSPTokenizer(model, self.config)
        return PororoSPTokenizer(model, self.config)
Пример #6
0
    def load(self, device: str):
        """
        Load the model selected by the configuration

        Args:
            device (str): device the loaded model is moved to

        Returns:
            object: PororoTransformerTransMulti when the model name contains
                "multi"; PororoTransformerParaphrase when it contains
                "transformer"; None otherwise

        """
        model_name = self.config.n_model

        if "multi" in model_name:
            from fairseq.models.transformer import TransformerModel

            from pororo.tasks.utils.tokenizer import CustomTokenizer

            # Multilingual checkpoints are always fetched under "multi".
            ckpt = download_or_load(f"transformer/{model_name}", "multi")

            multi_model = TransformerModel.from_pretrained(
                model_name_or_path=ckpt.path,
                checkpoint_file=f"{model_name}.pt",
                data_name_or_path=ckpt.dict_path,
                source_lang=ckpt.src_dict,
                target_lang=ckpt.tgt_dict,
            )
            multi_model = multi_model.eval().to(device)

            bpe = CustomTokenizer.from_file(
                vocab_filename=f"{ckpt.src_tok}/vocab.json",
                merges_filename=f"{ckpt.src_tok}/merges.txt",
            )

            return PororoTransformerTransMulti(multi_model, self.config, bpe)

        # Nothing else to build unless a transformer model was requested.
        if "transformer" not in model_name:
            return None

        from fairseq.models.transformer import TransformerModel

        ckpt = download_or_load(f"transformer/{model_name}", self.config.lang)

        para_model = TransformerModel.from_pretrained(
            model_name_or_path=ckpt.path,
            checkpoint_file=f"{model_name}.pt",
            data_name_or_path=ckpt.dict_path,
            source_lang=ckpt.src_dict,
            target_lang=ckpt.tgt_dict,
        )
        para_model = para_model.eval().to(device)

        # No tokenizer is loaded when the configured language is "zh".
        if self.config.lang == "zh":
            tokenizer = None
        else:
            from pororo.tasks.utils.tokenizer import CustomTokenizer

            tokenizer = CustomTokenizer.from_file(
                vocab_filename=f"{ckpt.src_tok}/vocab.json",
                merges_filename=f"{ckpt.src_tok}/merges.txt",
            )

        return PororoTransformerParaphrase(para_model, self.config, tokenizer)