Example #1
def extractDialogWithSudachi(rootDir):
    outputDir = utils.getOutputPath(rootDir, "stats")
    tokenizer_obj = dictionary.Dictionary().create()
    mode = tokenizer.Tokenizer.SplitMode.C

    unigrams = []
    bigrams = []
    trigrams = []
    fourgrams = []
    POS_LIST = ["名詞", "動詞", "副詞", "形容詞", "連体詞", "形状詞"]
    for fn, fd in utils.loadFiles(rootDir):
        for line in fd:
            line = line.strip()
            wordList = []
            for word in tokenizer_obj.tokenize(line, mode):
                if word.part_of_speech()[0] not in POS_LIST:
                    continue
                wordList.append(
                    (word.dictionary_form(), word.part_of_speech()[0]))
                print([
                    word.surface(),
                    word.dictionary_form(),
                    word.part_of_speech()[0]
                ])

            unigrams.extend(getChunks(wordList, 1))
            bigrams.extend(getChunks(wordList, 2))
            trigrams.extend(getChunks(wordList, 3))
            fourgrams.extend(getChunks(wordList, 4))

    _output(outputDir, unigrams, bigrams, trigrams, fourgrams)
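The getChunks helper used in Example #1 is not shown; a minimal sketch consistent with how it is called (a token list and an n-gram size) might look like this, assuming plain tuple n-grams are wanted:

def getChunks(wordList, n):
    # Slide a window of size n over the token list and return each window as a tuple.
    return [tuple(wordList[i:i + n]) for i in range(len(wordList) - n + 1)]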
Example #2
 def __init__(self, hinshi_list: List[str] = None):
     """
     :param hinshi_list: list of part-of-speech tags to keep, e.g. hinshi_list=["動詞", "名詞", "形容詞"]
     """
     self.tokenizer_obj = dictionary.Dictionary().create()
     self.mode = tokenizer.Tokenizer.SplitMode.C
     self.hinshi_list = hinshi_list
Example #3
    def __init__(self, nlp, mode=SUDACHI_DEFAULT_SPLITMODE):
        self.nlp = nlp

        resources_path = Path(__file__).parent / "resources"
        config.RESOURCEDIR = str(resources_path)
        setting_path = resources_path / "sudachi.json"
        config.SETTINGFILE = str(setting_path)

        with open(str(setting_path), "r", encoding="utf-8") as f:
            settings = json.load(f)
        settings['systemDict'] = str(
            resources_path / settings.get('systemDict', 'system_core.dic'))
        settings['characterDefinitionFile'] = str(
            resources_path /
            settings.get('characterDefinitionFile', 'char.def'))
        if 'oovProviderPlugin' in settings:
            for plugin in settings['oovProviderPlugin']:
                if plugin['class'] == 'com.worksap.nlp.sudachi.MeCabOovProviderPlugin':
                    plugin['charDef'] = str(resources_path /
                                            plugin.get('charDef', 'char.def'))
                    plugin['unkDef'] = str(resources_path /
                                           plugin.get('unkDef', 'unk.def'))

        dict_ = dictionary.Dictionary(settings)
        self.tokenizer = dict_.create()
        self.mode = mode
        self.use_sentence_separator = True
Example #4
def load_sudachi(mode=None):
    if mode is None:
        mode = tokenizer.Tokenizer.SplitMode.C
    else:
        mode = getattr(tokenizer.Tokenizer.SplitMode, mode)
    t = dictionary.Dictionary().create(mode=mode)
    return t
Example #5
 def __init__(self):
     import json
     from sudachipy import tokenizer, dictionary, config
     with open(config.SETTINGFILE, "r", encoding="utf-8") as f:
         settings = json.load(f)
     self.tokenizer_obj = dictionary.Dictionary(settings).create()
     self.mode = tokenizer.Tokenizer.SplitMode.C
Example #6
 def __init__(self, sp_model_path, bos_eos=True):
     self._sudachi_tokenizer = dictionary.Dictionary().create()
     self._sudachi_mode = tokenizer.Tokenizer.SplitMode.A
     self._sp_tokenizer = spm.SentencePieceProcessor()
     self._sp_tokenizer.load(sp_model_path)
     if bos_eos:
         self._sp_tokenizer.set_encode_extra_options('bos:eos')
Example #7
 def __init__(self, split_mode=None):
     self.tokenizer = dictionary.Dictionary().create()
     if split_mode == 'A':
         self.split_mode = tokenizer.Tokenizer.SplitMode.A
     elif split_mode == 'B':
         self.split_mode = tokenizer.Tokenizer.SplitMode.B
     else:
         self.split_mode = tokenizer.Tokenizer.SplitMode.C
Example #8
    def __init__(self, nlp=None, mode=SUDACHI_DEFAULT_SPLITMODE):
        self.nlp = nlp
        self.vocab = nlp.vocab if nlp is not None else Vocab()
        dictionary = try_import_sudachipy_dictionary()

        dict_ = dictionary.Dictionary()
        self.tokenizer = dict_.create()
        self.mode = mode
        self.use_sentence_separator = True
Example #9
 def tokenizer_inst(self):
     from sudachipy import dictionary
     from sudachipy import config
     import json
     if self._tokenizer_obj is None:
         with open(config.SETTINGFILE, "r", encoding="utf-8") as f:
             settings = json.load(f)
             self._tokenizer_obj = dictionary.Dictionary(settings).create()
     return self._tokenizer_obj
Example #10
 def __init__(self,
              stop_words,
              normalize=False,
              mode=tokenizer.Tokenizer.SplitMode.B):
     super().__init__(stop_words, normalize)
     with open(config.SETTINGFILE, "r", encoding="utf-8") as f:
         settings = json.load(f)
     self.tokenizer = dictionary.Dictionary(settings).create()
     self.mode = mode
Example #11
 def __init__(self,
              mention_anchors: Tuple[str] = MENTION_ANCHORS
 ):
     '''
     :param mention_anchors:
     '''
     self.tokenizer = sudachiDic.Dictionary().create()
     self.mode = sudachiTokenizer.Tokenizer.SplitMode.B
     self.mention_anchors = mention_anchors
Example #12
 def tokenize(self, text):
     try:
         import sudachipy
         from sudachipy import dictionary
     except ImportError as e:
         raise ValueError("Sudachi tokenizer requires sudachipy.")
     segmenter = dictionary.Dictionary(
         config_path="conf/sudachi.json").create()
     tokens = segmenter.tokenize(text,
                                 sudachipy.tokenizer.Tokenizer.SplitMode.C)
     return [l.surface() for l in tokens]
Example #13
def try_sudachi_import():
    """SudachiPy is required for Japanese support, so check for it.
    If it's not available, blow up and explain how to fix it."""
    try:
        from sudachipy import dictionary, tokenizer

        tok = dictionary.Dictionary().create(
            mode=tokenizer.Tokenizer.SplitMode.A)
        return tok
    except ImportError:
        raise ImportError("Japanese support requires SudachiPy: "
                          "https://github.com/WorksApplications/SudachiPy")
Example #14
 def __init__(self):
     with open(sudachipy.config.SETTINGFILE, 'r', encoding='utf-8') as f:
         sudachi_settings = json.load(f)
     sudachi_dict = dictionary.Dictionary(sudachi_settings)
     self.sudachi_instance = sudachi_dict.create()
     self.nameList = {
         '1': '一郎',
         '2': '二郎',
         '3': '三郎',
         '4': '四郎',
         '5': '五郎',
     }
Example #15
def word_count(texts, exclude_list):
    tokenizer_obj = dictionary.Dictionary().create()
    mode = tokenizer.Tokenizer.SplitMode.C
    words = []
    for text in texts:
        tokens = tokenizer_obj.tokenize(text, mode)
        for token in tokens:
            part_of_speech = token.part_of_speech()[0]
            if part_of_speech == '名詞' and token.dictionary_form() not in exclude_list:
                words.append(token.surface())
    return words
Example #16
    def __init__(self, config):
        """ Construct a SudachiPy-based tokenizer.

        Note that this tokenizer uses regex for sentence segmentation.
        """
        if config['lang'] != 'ja':
            raise Exception("SudachiPy tokenizer is only allowed in Japanese pipelines.")

        check_sudachipy()
        from sudachipy import tokenizer
        from sudachipy import dictionary

        self.tokenizer = dictionary.Dictionary().create()
Example #17
    def init(cls, mode: Literal["A", "B", "C"], dic=None):
        from sudachipy import dictionary, tokenizer

        if mode not in {"A", "B", "C"}:
            raise ValueError(mode)
        _mode = getattr(tokenizer.Tokenizer.SplitMode, mode)

        if dic is None:
            dic = dictionary.Dictionary().create()

        cls.mode = mode
        cls.sudachi = dic
        cls._mode = _mode
Example #18
    def __init__(self, path, sudachiDataPath="sudachiData.pickle"):
        f = open(path, 'r')
        self.file = f
        self.reader = csv.reader(f, delimiter=' ')
        # On the first run, build the word list and the list of file offsets (this takes quite a while).
        # On subsequent runs, load the pickled data instead.
        if os.path.exists(sudachiDataPath):
            with open(sudachiDataPath, 'rb') as f:
                dataset = pickle.load(f)
            self.offset_list = dataset["offset_list"]
            self.emb_size = dataset["emb_size"]
            self.word2index = dataset["word2index"]
            self.ave_vec = dataset["ave_vec"]
        else:
            txt = f.readline()
            # dimensionality of the word embeddings
            self.emb_size = int(txt.split()[1])
            # the average vector is returned for unknown words
            self.ave_vec = np.zeros(self.emb_size, float)
            # list of byte offsets into the file
            self.offset_list = []
            word_list = []
            count = 0
            maxCount = int(txt.split()[0])
            while True:
                count += 1
                self.offset_list.append(f.tell())
                if count % 100000 == 0: print(count, "/", maxCount)
                line = f.readline()
                if line == '': break
                line_list = line.split()
                word_list.append(line_list[0])
                self.ave_vec += np.array(line_list[-300:]).astype(float)
            self.offset_list.pop()
            self.ave_vec = self.ave_vec / count
            self.word2index = {v: k for k, v in enumerate(word_list)}

            dataset = {}
            dataset["offset_list"] = self.offset_list
            dataset["emb_size"] = self.emb_size
            dataset["word2index"] = self.word2index
            dataset["ave_vec"] = self.ave_vec
            with open(sudachiDataPath, 'wb') as f:
                pickle.dump(dataset, f)

        self.num_rows = len(self.offset_list)
        # set up Sudachi
        self.tokenizer_obj = dictionary.Dictionary().create()
        self.mode = tokenizer.Tokenizer.SplitMode.B
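The comments in Example #18 describe caching byte offsets so that individual vectors can be read back with seek() instead of keeping the whole embedding file in memory. A hypothetical lookup method (not part of the original class) built on word2index, offset_list and ave_vec could look like this sketch:

    def get_vector(self, word):
        # unknown words fall back to the average vector, as noted above
        if word not in self.word2index:
            return self.ave_vec
        # seek to the cached offset of the word's line and parse its vector
        self.file.seek(self.offset_list[self.word2index[word]])
        line_list = self.file.readline().split()
        return np.array(line_list[-self.emb_size:]).astype(float)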
Example #19
    def get_token(self, source):
        """ 形態素解析(Suadchi)

        Args:
            source ([str]): [対象の文]

        Returns:
            [List[str]]: [形態素解析した単語のリスト]
        """
        tokenizer_obj = dictionary.Dictionary().create()
        result = [
            m.surface() for m in tokenizer_obj.tokenize(source, self.mode)
        ]

        return result
Example #20
    def __init__(self, mode: str, with_postag: bool, **kwargs) -> None:
        """
        Initializer for SudachiTokenizer

        Parameters
        ---
        mode (str)
            Splitting mode which controls the granularity of konoha.token.
            (mode should be `A`, `B` or `C`)
            For more information, see following links.
            - document: https://github.com/WorksApplications/Sudachi#the-modes-of-splitting  # NOQA
            - paper: http://www.lrec-conf.org/proceedings/lrec2018/summaries/8884.html  # NOQA
        with_postag (bool=False)
            flag that determines if konoha.tokenizer includes POS tags.
        **kwargs
            others.
        """
        try:
            from sudachipy import tokenizer
            from sudachipy import dictionary
        except ImportError:
            msg = "importing sudachipy failed for some reason."
            msg += "\n  1. make sure SudachiPy is successfully installed."
            msg += "\n  2. make sure dictionary is successfully installed."
            raise ImportError(msg)

        super(SudachiTokenizer, self).__init__(
            name="sudachi ({})".format(mode),
            with_postag=with_postag,
        )
        try:
            self._tokenizer = dictionary.Dictionary().create()
        except KeyError:
            msg = "please install dictionary"
            msg += " ( see https://github.com/WorksApplications/SudachiPy#install-dict-packages )"  # NOQA
            raise KeyError(msg)

        _mode = mode.capitalize()
        if _mode == "A":
            self._mode = tokenizer.Tokenizer.SplitMode.A
        elif _mode == "B":
            self._mode = tokenizer.Tokenizer.SplitMode.B
        elif _mode == "C":
            self._mode = tokenizer.Tokenizer.SplitMode.C
        else:
            raise ValueError(
                "Invalid mode is specified. Mode should be A, B, or C."
            )  # NOQA
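A hypothetical usage of the tokenizer above, assuming the surrounding konoha-style class exposes a tokenize method that returns token objects with a surface attribute (neither is shown in this snippet):

tokenizer = SudachiTokenizer(mode="C", with_postag=True)
tokens = tokenizer.tokenize("吾輩は猫である")
print([token.surface for token in tokens])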
Example #21
    def __init__(self,
                 nlp=None,
                 mode=SUDACHIPY_DEFAULT_SPLIT_MODE,
                 config_path=None):
        self.nlp = nlp
        self.vocab = nlp.vocab if nlp is not None else Vocab()
        dictionary = try_import_sudachipy_dictionary()

        split_mode = sudachipy_split_mode(mode)
        if not config_path:
            config_path = dict_package_path() / 'sudachi.json'
        dict_ = dictionary.Dictionary(config_path=config_path)
        self.tokenizer = dict_.create(mode=split_mode)
        self._mode = mode
        self.use_sentence_separator = True
        self.enable_ex_sudachi = False
Example #22
def get_word_freqs(texts):
    """
    文字列のリストを入力すると,すべての文字列に対して形態素解析を行い,
    正規化された単語の出現回数と,形態素解析の結果を辞書型で返す.
    無視される品詞:補助記号,空白,助動詞,助詞,代名詞,接頭辞,接尾辞
    - return
        {
            <正規化された単語>: {
                "count": <出現回数>,
                "raws": [<token>, <token>, ...]
            }
        }
    """

    # remove URLs
    texts = [
        re.sub(r'(http|https)://([-\w]+\.)+[-\w]+(/[-\w./?%&=]*)?', "", text)
        for text in texts
    ]

    word_freqs = {}

    # create the morphological analyzer
    tokenizer_obj = dictionary.Dictionary().create()
    mode = tokenizer.Tokenizer.SplitMode.C

    print("\ntokenizing ...")
    for text in tqdm(texts):
        tokens = tokenizer_obj.tokenize(text, mode)
        for token in tokens:
            # skip specific POS tags
            pos = token.part_of_speech()[0]
            if pos in ["補助記号", "空白", "助動詞", "助詞", "代名詞", "接頭辞", "接尾辞"]:
                continue

            normalized_token = token.normalized_form()
            if normalized_token in word_freqs:
                # count a word that has already appeared
                word_freqs[normalized_token]["count"] += 1
                word_freqs[normalized_token]["raws"] += [token]
            else:
                # count a word appearing for the first time
                word_freqs[normalized_token] = {}
                word_freqs[normalized_token]["count"] = 1
                word_freqs[normalized_token]["raws"] = [token]

    return word_freqs
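A minimal usage sketch of get_word_freqs; the input strings here are placeholders, and the output follows the dict structure described in the docstring:

word_freqs = get_word_freqs(["メロスは激怒した。", "メロスには政治がわからぬ。"])
for word, entry in sorted(word_freqs.items(), key=lambda kv: kv[1]["count"], reverse=True)[:10]:
    print(word, entry["count"], len(entry["raws"]))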
Example #23
    def __init__(self, nlp=None, mode=SUDACHI_DEFAULT_SPLITMODE):
        self.nlp = nlp
        self.vocab = nlp.vocab if nlp is not None else Vocab()
        dictionary = try_import_sudachipy_dictionary()

        split_mode_enum = try_import_sudachipy_split_mode()
        if mode == 'A':
            split_mode = split_mode_enum.A
        elif mode == 'B':
            split_mode = split_mode_enum.B
        elif mode == 'C':
            split_mode = split_mode_enum.C
        else:
            raise Exception('mode must be A, B, or C ({})'.format(str(mode)))
        dict_ = dictionary.Dictionary()
        self.tokenizer = dict_.create(mode=split_mode)
        self.use_sentence_separator = True
Example #24
    def __init__(self, mode: str, with_postag: bool, **kwargs):
        """
        Initializer for SudachiTokenizer

        Parameters
        ---
        mode (str)
            Splitting mode which controls the granularity of tiny_tokenizer.token.
            (mode should be `A`, `B` or `C`)
            For more information, see following links.
            - document: https://github.com/WorksApplications/Sudachi#the-modes-of-splitting  # NOQA
            - paper: http://www.lrec-conf.org/proceedings/lrec2018/summaries/8884.html  # NOQA
        with_postag (bool=False)
            flag that determines if tiny_tokenizer.tokenizer includes POS tags.
        **kwargs
            others.
        """
        super(SudachiTokenizer, self).__init__(f"sudachi ({mode})")
        try:
            from sudachipy import tokenizer
            from sudachipy import dictionary
        except ModuleNotFoundError:
            raise ModuleNotFoundError("sudachipy is not installed")
        try:
            self.tokenizer = dictionary.Dictionary().create()
        except KeyError:
            msg = "please install dictionary"
            msg += " ( see https://github.com/WorksApplications/SudachiPy#install-dict-packages )"  # NOQA
            raise KeyError(msg)

        _mode = mode.capitalize()
        if _mode == "A":
            self.mode = tokenizer.Tokenizer.SplitMode.A
        elif _mode == "B":
            self.mode = tokenizer.Tokenizer.SplitMode.B
        elif _mode == "C":
            self.mode = tokenizer.Tokenizer.SplitMode.C
        else:
            msg = "Invalid mode is specified. Mode should be 'A', 'B' or 'C'"
            raise ValueError(msg)

        self.with_postag = with_postag
Example #25
def try_sudachi_import(split_mode="A"):
    """SudachiPy is required for Japanese support, so check for it.
    If it's not available, blow up and explain how to fix it.
    split_mode should be one of these values: "A", "B", "C", None->"A"."""
    try:
        from sudachipy import dictionary, tokenizer
        split_mode = {
            None: tokenizer.Tokenizer.SplitMode.A,
            "A": tokenizer.Tokenizer.SplitMode.A,
            "B": tokenizer.Tokenizer.SplitMode.B,
            "C": tokenizer.Tokenizer.SplitMode.C,
        }[split_mode]
        tok = dictionary.Dictionary().create(mode=split_mode)
        return tok
    except ImportError:
        raise ImportError(
            "Japanese support requires SudachiPy and SudachiDict-core "
            "(https://github.com/WorksApplications/SudachiPy). "
            "Install with `pip install sudachipy sudachidict_core` or "
            "install spaCy with `pip install spacy[ja]`.")
Example #26
    def __init__(self, mode: str) -> None:
        from sudachipy import dictionary
        from sudachipy import tokenizer
        super().__init__(name="sudachi ({})".format(mode))

        try:
            self._tokenizer = dictionary.Dictionary().create()
        except KeyError:
            msg = "Loading a dictionary fails."
            msg += " ( see https://github.com/WorksApplications/SudachiPy#install-dict-packages )"  # NOQA
            raise KeyError(msg)

        _mode = mode.capitalize()
        if _mode == "A":
            self._mode = tokenizer.Tokenizer.SplitMode.A
        elif _mode == "B":
            self._mode = tokenizer.Tokenizer.SplitMode.B
        elif _mode == "C":
            self._mode = tokenizer.Tokenizer.SplitMode.C
        else:
            raise ValueError("Invalid mode is specified. Mode should be A, B, or C.")  # NOQA
Example #27
    def get_token(self, source):

        with open(sudachipy.config.SETTINGFILE, "r", encoding="utf-8") as f:
            settings = json.load(f)
        tokenizer_obj = dictionary.Dictionary(settings).create()

        mode = tokenizer.Tokenizer.SplitMode.C
        result = [m.surface() for m in tokenizer_obj.tokenize(mode, source)]

        word_list = []
        for mrph in result:
            if not (mrph == ""):
                norm_word = tokenizer_obj.tokenize(mode, mrph)[0].normalized_form()
                hinsi = tokenizer_obj.tokenize(mode, norm_word)[0].part_of_speech()[0]

                # adopt the word only when its normalized form has one of these POS tags
                if hinsi in ["名詞", "動詞", "形容詞"]:
                    word = tokenizer_obj.tokenize(mode, norm_word)[0].dictionary_form()
                    word_list.append(word)

        return word_list
Example #28
 def __init__(self):
     super().__init__()
     INPUT_PRETRAINED_VECTORS = "/data/chive_v1.2mc90/chive-1.2-mc90_gensim/chive-1.2-mc90.kv"
     pretrained_vectors = gensim.models.KeyedVectors.load(
         INPUT_PRETRAINED_VECTORS)
     # filtering based on the pretrained vectors
     words_found_in_pretrained_vectors = set(
         [t for t in pretrained_vectors.vocab.keys()])
     # keep only the words contained in the pretrained vectors and use them as the vocab
     self.tokenizer = dictionary.Dictionary().create()
     # tokenize
     token_sequences = []
     for instance in tqdm(self.text):
         token_sequence = [
             token.surface() for token in self.tokenizer.tokenize(instance)
         ]
         token_sequences.append(token_sequence)
     # build a Vocab from the words shared by the training corpus and the pretrained embeddings
     counter = Counter()
     for token_sequence in tqdm(token_sequences):
         counter.update(words_found_in_pretrained_vectors
                        & set(token_sequence))
     self.vocab = torchtext.vocab.vocab(counter)
     self.vocab.insert_token("<pad>", 0)
     self.vocab.insert_token("<unk>", 1)
     self.vocab.set_default_index(1)
     # convert token sequences to id sequences
     self.id_sequences = []
     for token_sequence in token_sequences:
         self.id_sequences.append(
             torch.tensor([self.vocab[token] for token in token_sequence]))
     # build the training vectors; to keep them small, use only the intersection with the training vocabulary
     vectors_for_unk_and_pad = np.zeros((2, 300))
     itos = self.vocab.get_itos()
     words = [itos[i] for i in range(len(self.vocab))]
     self.vectors = np.concatenate(
         (vectors_for_unk_and_pad,
          np.array([pretrained_vectors[w] for w in words[2:]])),
         axis=0)
Example #29
import os
from sudachipy import dictionary as sudachi_dict

VERBOSE = False
PARAPHRASE = False
WEIGHTED_SIMILARITY = False
TOKENIZER = sudachi_dict.Dictionary().create()
PROJECT_DIR = os.path.dirname(os.path.realpath('__file__'))
SIMILARITY = None
WIKI_STATS = None
Example #30
def main(args):
    with open(args.akama_file, encoding='utf8') as f:
        style_lines = f.read().rstrip().split('\n')
        header = style_lines[0]
        del style_lines[0]

    entry_to_sents = {}

    for line in style_lines:
        comps = line.split(',')
        entry_to_sents[comps[0]] = []
        entry_to_sents[comps[1]] = []

    tokenizer_obj = dictionary.Dictionary(args.sudachipy_config).create()

    with open(args.corpus, encoding='utf8') as f:
        for i, line in enumerate(f):
            line = line.rstrip()
            if i % 1000 == 0:
                print(f'Processing line {i}')
            morphemes = tokenizer_obj.tokenize(line)
            for m in morphemes:
                all_pos = m.part_of_speech()
                for pos_i in range(min(2, len(all_pos)) - 1, -1, -1):
                    entry = m.surface() + '/' + all_pos[pos_i]
                    if entry in entry_to_sents:
                        entry_to_sents[entry].append(line)
                        break

    for l in entry_to_sents.values():
        # keep the shortest sentences
        l.sort(key=lambda s: len(s))
        if len(l) > args.sentences_per_pair * 2:
            del l[args.sentences_per_pair * 2:]
        random.shuffle(l)

    found, not_found = 0, 0
    total_list_len = 0
    for key, l in entry_to_sents.items():
        if l:
            found += 1
            total_list_len += len(l)
        else:
            not_found += 1

    print(
        f'Found {found} / {found + not_found} entries, avg list len {total_list_len / found}'
    )

    found_pairs = 0

    for split in ['dev', 'test']:
        with open(getattr(args, 'out_path_' + split), 'w',
                  encoding='utf8') as f:
            wr = csv.writer(f, quoting=csv.QUOTE_ALL)
            wr.writerow(['sentence 1', 'sentence 2'] + header.split(','))

            for line in style_lines:
                comps = line.split(',')
                entry1, entry2 = comps[0], comps[1]
                found = False

                right_center = random.randint(0, 1)
                half1, half2 = (len(entry_to_sents[entry1]) + right_center) // 2, \
                               (len(entry_to_sents[entry2]) + right_center) // 2
                if split == 'dev':
                    l1, l2 = entry_to_sents[entry1][:half1], entry_to_sents[
                        entry2][:half2]
                else:
                    l1, l2 = entry_to_sents[entry1][half1:], entry_to_sents[
                        entry2][half2:]

                for entry1_sent in l1:
                    for entry2_sent in l2:
                        found = True
                        wr.writerow([entry1_sent] + [entry2_sent] + [entry1] +
                                    [entry2] + comps[2:])

                if found:
                    found_pairs += 1

    print(f'Found {found_pairs} / {len(style_lines)} entry pairs')