def __init__(self, do_lower_case=False, never_split=None, normalize_text=True, mecab_option: Optional[str] = None):
    """Constructs a MecabTokenizer.

    Args:
        **do_lower_case**: (`optional`) boolean (default False)
            Whether to lower case the input.
        **never_split**: (`optional`) list of str
            Kept for backward compatibility purposes.
            Now implemented directly at the base class level (see
            :func:`PreTrainedTokenizer.tokenize`) List of tokens not to split.
        **normalize_text**: (`optional`) boolean (default True)
            Whether to apply unicode normalization to text before tokenization.
        **mecab_option**: (`optional`) string
            String passed to `MeCab.Tagger` constructor (default "").
    """
    self.do_lower_case = do_lower_case
    self.never_split = never_split if never_split is not None else []
    self.normalize_text = normalize_text

    import fugashi
    import ipadic

    # Use ipadic by default; caller-supplied options are appended afterwards
    # so they can override the dictionary defaults.
    mecab_option = mecab_option or ""
    mecab_option = ipadic.MECAB_ARGS + " " + mecab_option

    self.mecab = fugashi.GenericTagger(mecab_option)
def __init__(self, mecab_option=None):
    """Create a fugashi ``GenericTagger``, defaulting to the unidic-lite dictionary.

    Args:
        mecab_option: Raw MeCab option string. When ``None``, the bundled
            unidic-lite dictionary and its ``mecabrc`` are used.
    """
    import fugashi

    if mecab_option is None:
        # No explicit options: point MeCab at the bundled unidic-lite
        # dictionary and its resource file.
        import unidic_lite

        dictionary_dir = unidic_lite.DICDIR
        rc_path = os.path.join(dictionary_dir, "mecabrc")
        mecab_option = "-d {} -r {}".format(dictionary_dir, rc_path)

    self.mecab = fugashi.GenericTagger(mecab_option)
def tokenize(lines):
    """Tokenize each line in-place with MeCab (ipadic, wakati output).

    Args:
        lines: list of str sentences to tokenize.

    Returns:
        The same list, with each entry replaced by the list of fugashi
        nodes produced for that line.
    """
    tagger = fugashi.GenericTagger(ipadic.MECAB_ARGS + ' -Owakati')
    for i, line in enumerate(lines):
        # tagger(line) is a list of fugashi nodes; mutate in place so the
        # caller's list is updated as well (original behavior).
        lines[i] = tagger(line)
    return lines
def main():
    """Build RCQA train/dev/test JSON splits, tokenizing with MeCab.

    Reads the gzipped RCQA file given on the command line, keeps only
    entries that have documents, splits them by timestamp
    (train < 2009 <= dev < 2010 <= test), converts each split with the
    selected tokenizer, and writes one JSON file per split.
    """
    args = parse_args()

    if args.unidic:
        dicdir = args.dicdir or unidic_lite.DICDIR
        rcfile = os.path.join(dicdir, 'mecabrc')
        tokenizer = unidic_tokenize
    else:
        dicdir = args.dicdir or '/var/lib/mecab/dic/juman-utf8'
        rcfile = args.rcfile or '/etc/mecabrc'
        tokenizer = juman_tokenize

    # Validate explicitly instead of `assert`, which is stripped under -O.
    if not (dicdir and rcfile):
        raise ValueError("Both a dictionary directory and an rc file are required.")

    global tagger
    tagger = fugashi.GenericTagger(f'-r {rcfile} -d {dicdir}')

    charset = tagger.dictionary_info[0]['charset']
    if charset not in ('utf-8', 'utf8'):
        raise RuntimeError(f"Dictionary charset must be UTF-8, got {charset!r}.")

    dataset = []
    with gzip.open(args.rcqafile, "rt", encoding="utf-8") as fp:
        for line in fp:
            data = json.loads(line)
            if data["documents"]:
                dataset.append(data)

    train_dataset = [data for data in dataset if data["timestamp"] < "2009"]
    dev_dataset = [
        data for data in dataset if "2009" <= data["timestamp"] < "2010"
    ]
    test_dataset = [data for data in dataset if "2010" <= data["timestamp"]]

    for filename, datasplit in (("rcqa_train.json", train_dataset),
                                ("rcqa_dev.json", dev_dataset),
                                ("rcqa_test.json", test_dataset)):
        entries = convert(datasplit, tokenizer, args.oldformat)
        with open(filename, "w", encoding="utf-8") as fp:
            json.dump({"data": entries}, fp, ensure_ascii=False)
def __init__(self, mecab_dic: Optional[str] = None, mecab_option: Optional[str] = None) -> None:
    """Create a fugashi ``GenericTagger``.

    Args:
        mecab_dic: Name of the bundled dictionary package to use
            ("unidic_lite", "unidic" or "ipadic"). If ``None``,
            ``mecab_option`` alone is passed to MeCab (e.g. for a
            system-installed dictionary).
        mecab_option: Extra options appended to the MeCab command line.

    Raises:
        ValueError: If ``mecab_dic`` is not one of the supported names.
    """
    import fugashi

    mecab_option = mecab_option or ""

    if mecab_dic is not None:
        if mecab_dic == "unidic_lite":
            import unidic_lite

            dic_dir = unidic_lite.DICDIR
        elif mecab_dic == "unidic":
            import unidic

            dic_dir = unidic.DICDIR
        elif mecab_dic == "ipadic":
            import ipadic

            dic_dir = ipadic.DICDIR
        else:
            raise ValueError("Invalid mecab_dic is specified.")

        mecabrc = os.path.join(dic_dir, "mecabrc")
        # Quote the paths so a dictionary installed under a path containing
        # spaces still parses as single MeCab arguments.
        mecab_option = '-d "{}" -r "{}" '.format(dic_dir, mecabrc) + mecab_option

    self.mecab = fugashi.GenericTagger(mecab_option)
def __init__(
    self,
    do_lower_case=False,
    never_split=None,
    normalize_text=True,
    mecab_dic: Optional[str] = "ipadic",
    mecab_option: Optional[str] = None,
):
    """
    Constructs a MecabTokenizer.

    Args:
        **do_lower_case**: (`optional`) boolean (default False)
            Whether to lowercase the input.
        **never_split**: (`optional`) list of str
            Kept for backward compatibility purposes. Now implemented directly at the base class level (see
            :func:`PreTrainedTokenizer.tokenize`) List of tokens not to split.
        **normalize_text**: (`optional`) boolean (default True)
            Whether to apply unicode normalization to text before tokenization.
        **mecab_dic**: (`optional`) string (default "ipadic")
            Name of dictionary to be used for MeCab initialization. If you are using a system-installed dictionary,
            set this option to `None` and modify `mecab_option`.
        **mecab_option**: (`optional`) string
            String passed to MeCab constructor.
    """
    self.do_lower_case = do_lower_case
    self.never_split = never_split if never_split is not None else []
    self.normalize_text = normalize_text

    try:
        import fugashi
    except ModuleNotFoundError as error:
        raise error.__class__(
            "You need to install fugashi to use MecabTokenizer. "
            "See https://pypi.org/project/fugashi/ for installation.")

    mecab_option = mecab_option or ""

    if mecab_dic is not None:
        if mecab_dic == "ipadic":
            try:
                import ipadic
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "The ipadic dictionary is not installed. "
                    "See https://github.com/polm/ipadic-py for installation."
                )
            dic_dir = ipadic.DICDIR
        elif mecab_dic == "unidic_lite":
            try:
                import unidic_lite
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "The unidic_lite dictionary is not installed. "
                    "See https://github.com/polm/unidic-lite for installation."
                )
            dic_dir = unidic_lite.DICDIR
        elif mecab_dic == "unidic":
            try:
                import unidic
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "The unidic dictionary is not installed. "
                    "See https://github.com/polm/unidic-py for installation."
                )
            dic_dir = unidic.DICDIR
            if not os.path.isdir(dic_dir):
                # Added missing space between the two sentences of this message.
                raise RuntimeError(
                    "The unidic dictionary itself is not found. "
                    "See https://github.com/polm/unidic-py for installation."
                )
        else:
            raise ValueError("Invalid mecab_dic is specified.")

        mecabrc = os.path.join(dic_dir, "mecabrc")
        # Paths are quoted so dictionaries under paths with spaces still work.
        mecab_option = f'-d "{dic_dir}" -r "{mecabrc}" ' + mecab_option

    self.mecab = fugashi.GenericTagger(mecab_option)
def load(self, device: str):
    """
    Load user-selected task-specific model

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    if self.config.n_model == "g2p.ko":
        try:
            from g2pk import G2p as G2pK
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install g2pk with: `pip install g2pk`")
        model = G2pK()
        return PororoG2PKo(model, self.config)

    if self.config.n_model == "g2p.en":
        try:
            from g2p_en import G2p as G2pE
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install g2p_en with: `pip install g2p_en`")
        model = G2pE()
        return PororoG2PEn(model, self.config)

    if self.config.n_model == "g2p.zh":
        try:
            from g2pM import G2pM
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install g2pM with: `pip install g2pM`")
        model = G2pM()
        return PororoG2PZh(model, self.config)

    if self.config.n_model == "g2p.ja":
        try:
            import fugashi
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install fugashi with: `pip install fugashi`")
        try:
            import ipadic
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install ipadic with: `pip install ipadic`")
        try:
            import romkan
        except ModuleNotFoundError as error:
            # Fixed: this message previously named ipadic instead of romkan.
            raise error.__class__(
                "Please install romkan with: `pip install romkan`")
        # Point MeCab at the bundled ipadic dictionary and its mecabrc.
        dic_dir = ipadic.DICDIR
        mecabrc = os.path.join(dic_dir, "mecabrc")
        mecab_option = "-d {} -r {} ".format(
            dic_dir,
            mecabrc,
        )
        tagger = fugashi.GenericTagger(mecab_option)
        return PororoG2PJa(tagger, romkan.to_roma, self.config)
def load(self, device: str):
    """
    Load user-selected task-specific model

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    # Sentence splitting: any model name containing "sent" uses NLTK punkt,
    # downloading it on first use.
    if "sent" in self.config.n_model:
        import nltk

        try:
            nltk.data.find("tokenizers/punkt")
        except LookupError:
            nltk.download("punkt")

        from nltk.tokenize import sent_tokenize
        return PororoSentTokenizer(sent_tokenize, self.config)

    if self.config.n_model == "mecab_ko":
        try:
            import mecab
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install python-mecab-ko with: `pip install python-mecab-ko`"
            )
        model = mecab.MeCab()
        return PororoMecabKoTokenizer(model, self.config)

    if self.config.n_model == "char":
        return PororoCharTokenizer(self.config)

    if self.config.n_model == "jamo":
        return PororoJamoTokenizer(self.config)

    if self.config.n_model == "word":
        return PororoWordTokenizer(self.config)

    if self.config.n_model == "roberta":
        from fairseq.data.encoders.gpt2_bpe import get_encoder

        encoder = download_or_load("misc/encoder.json", self.config.lang)
        vocab = download_or_load("misc/vocab.bpe", self.config.lang)
        model = get_encoder(encoder, vocab)

        # Re-read the encoder file to build an id -> token inverse map.
        with open(encoder, "r") as f_vocab:
            vocab = json.load(f_vocab)
        inv_dict = {v: k for k, v in vocab.items()}

        return PororoRoBERTaTokenizer(model, vocab, inv_dict, self.config)

    if self.config.n_model == "moses":
        try:
            from sacremoses import MosesDetokenizer, MosesTokenizer
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install sacremoses with: `pip install sacremoses`")
        model = MosesTokenizer(lang="en")
        detok = MosesDetokenizer(lang="en")
        return PororoMosesTokenizer(model, detok, self.config)

    if self.config.n_model == "jieba":
        try:
            import jieba
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install jieba with: `pip install jieba`")
        model = jieba.cut
        return PororoJiebaTokenizer(model, self.config)

    if self.config.n_model == "mecab":
        try:
            import fugashi
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install fugashi with: `pip install fugashi`")
        try:
            import ipadic
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install ipadic with: `pip install ipadic`")
        # Point MeCab at the bundled ipadic dictionary and its mecabrc.
        dic_dir = ipadic.DICDIR
        mecabrc = os.path.join(dic_dir, "mecabrc")
        mecab_option = "-d {} -r {} ".format(
            dic_dir,
            mecabrc,
        )
        model = fugashi.GenericTagger(mecab_option)
        return PororoMecabTokenizer(model, self.config)
    else:
        # Fallback: treat n_model as the name of a downloadable custom
        # tokenizer package (BPE/unigram vocab files).
        from pororo.tasks.utils.tokenizer import CustomTokenizer

        path = download_or_load(
            f"tokenizers/{self.config.n_model}.zip",
            self.config.lang,
        )

        # Unigram models ship a .txt vocab and no merges file.
        ext = "json" if "unigram" not in self.config.n_model else "txt"
        merges_filename = (f"{path}/merges.txt"
                           if "unigram" not in self.config.n_model else None)

        model = CustomTokenizer.from_file(
            vocab_filename=f"{path}/vocab.{ext}",
            merges_filename=merges_filename,
            # Jamo-pair ("jpe") models must not be unicode-normalized.
            normalize=True if "jpe" not in self.config.n_model else False,
        )
        if "jpe" in self.config.n_model:
            return PororoJamoPairTokenizer(model, self.config)
        if "mecab.bpe" in self.config.n_model:
            return PororoMecabSPTokenizer(model, self.config)
        return PororoSPTokenizer(model, self.config)
def load(self, device: str):
    """
    Load user-selected task-specific model

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    n_model = self.config.n_model

    if n_model == "nltk":
        import nltk

        # Fetch the two required NLTK resources on first use.
        for resource, package in (
            ("tokenizers/punkt", "punkt"),
            ("taggers/averaged_perceptron_tagger", "averaged_perceptron_tagger"),
        ):
            try:
                nltk.data.find(resource)
            except LookupError:
                nltk.download(package)

        return PororoNLTKPosTagger(nltk, self.config)

    if n_model == "mecab-ko":
        try:
            import mecab
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install python-mecab-ko with: `pip install python-mecab-ko`"
            )
        return PororoMecabPos(mecab.MeCab(), self.config)

    if n_model == "mecab-ipadic":
        try:
            import fugashi
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install fugashi with: `pip install fugashi`")
        try:
            import ipadic
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install ipadic with: `pip install ipadic`")

        # Point MeCab at the bundled ipadic dictionary and its mecabrc.
        rc_path = os.path.join(ipadic.DICDIR, "mecabrc")
        tagger = fugashi.GenericTagger("-d {} -r {} ".format(ipadic.DICDIR, rc_path))
        return PororoMecabJap(tagger, self.config)

    if n_model == "jieba":
        try:
            import jieba  # noqa
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install jieba with: `pip install jieba`")
        import jieba.posseg as jieba_pos
        return PororoJieba(jieba_pos, self.config)
def __init__(
    self,
    do_lower_case=False,
    never_split=None,
    normalize_text=True,
    mecab_dic="ipadic",
    mecab_option=None,
):
    """
    Constructs a MecabTokenizer.

    Args:
        do_lower_case (bool): Whether to lowercase the input. Defaults to `False`.
        never_split (list): Kept for backward compatibility purposes. Defaults to `None`.
        normalize_text (bool): Whether to apply unicode normalization to text before tokenization.
            Defaults to `True`.
        mecab_dic (string): Name of dictionary to be used for MeCab initialization. If you are using a
            system-installed dictionary, set this option to `None` and modify `mecab_option`.
            Defaults to `ipadic`.
        mecab_option (string): String passed to MeCab constructor. Defaults to `None`.
    """
    self.do_lower_case = do_lower_case
    self.never_split = never_split if never_split is not None else []
    self.normalize_text = normalize_text

    try:
        import fugashi
    except ModuleNotFoundError as error:
        raise error.__class__(
            "You need to install fugashi to use MecabTokenizer. "
            "See https://pypi.org/project/fugashi/ for installation.")

    mecab_option = mecab_option or ""

    if mecab_dic is not None:
        if mecab_dic == "ipadic":
            try:
                import ipadic
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "The ipadic dictionary is not installed. "
                    "See https://github.com/polm/ipadic-py for installation."
                )
            dic_dir = ipadic.DICDIR
        elif mecab_dic == "unidic_lite":
            try:
                import unidic_lite
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "The unidic_lite dictionary is not installed. "
                    "See https://github.com/polm/unidic-lite for installation."
                )
            dic_dir = unidic_lite.DICDIR
        elif mecab_dic == "unidic":
            try:
                import unidic
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "The unidic dictionary is not installed. "
                    "See https://github.com/polm/unidic-py for installation."
                )
            dic_dir = unidic.DICDIR
            if not os.path.isdir(dic_dir):
                # Added missing space between the two sentences of this message.
                raise RuntimeError(
                    "The unidic dictionary itself is not found. "
                    "See https://github.com/polm/unidic-py for installation."
                )
        else:
            raise ValueError("Invalid mecab_dic is specified.")

        mecabrc = os.path.join(dic_dir, "mecabrc")
        # Paths are quoted so dictionaries under paths with spaces still work.
        mecab_option = f'-d "{dic_dir}" -r "{mecabrc}" ' + mecab_option

    self.mecab = fugashi.GenericTagger(mecab_option)
import os

import ipadic
import fugashi

# Module-level MeCab tagger backed by the bundled ipadic dictionary;
# shared by the functions below.
dic_dir = ipadic.DICDIR
mecabrc = os.path.join(dic_dir, "mecabrc")
mecab_option = f"-d {dic_dir} -r {mecabrc}"
tagger = fugashi.GenericTagger(mecab_option)


def tokenize(text):
    """
    A method for word segmentation.

    Parameters
    ----------
    text : str
        An input text

    Returns
    -------
    words : list
        A list of words
    """
    # Each fugashi node's `surface` is the raw token text.
    words = [word.surface for word in tagger(text)]
    return words


def original_usage(text):