def extractDialogWithSudachi(rootDir):
    outputDir = utils.getOutputPath(rootDir, "stats")
    tokenizer_obj = dictionary.Dictionary().create()
    mode = tokenizer.Tokenizer.SplitMode.C
    unigrams = []
    bigrams = []
    trigrams = []
    fourgrams = []
    # Content-word POS tags to keep: noun, verb, adverb, adjective,
    # adnominal, adjectival noun.
    POS_LIST = ["名詞", "動詞", "副詞", "形容詞", "連体詞", "形状詞"]
    for fn, fd in utils.loadFiles(rootDir):
        for line in fd:
            line = line.strip()
            wordList = []
            for word in tokenizer_obj.tokenize(line, mode):
                if word.part_of_speech()[0] not in POS_LIST:
                    continue
                wordList.append((word.dictionary_form(), word.part_of_speech()[0]))
                print([word.surface(), word.dictionary_form(), word.part_of_speech()[0]])  # debug output
            unigrams.extend(getChunks(wordList, 1))
            bigrams.extend(getChunks(wordList, 2))
            trigrams.extend(getChunks(wordList, 3))
            fourgrams.extend(getChunks(wordList, 4))
    _output(outputDir, unigrams, bigrams, trigrams, fourgrams)
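getChunks is not defined in this snippet; a minimal sketch, assuming it simply yields the consecutive n-grams of the filtered token list:

def getChunks(wordList, n):
    # assumed helper: consecutive n-grams, e.g. n=2 over [a, b, c] -> [(a, b), (b, c)]
    return [tuple(wordList[i:i + n]) for i in range(len(wordList) - n + 1)]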
def __init__(self, hinshi_list: List[str] = None):
    """
    :param hinshi_list: list of part-of-speech tags to keep,
        e.g. hinshi_list=["動詞", "名詞", "形容詞"]
    """
    self.tokenizer_obj = dictionary.Dictionary().create()
    self.mode = tokenizer.Tokenizer.SplitMode.C
    self.hinshi_list = hinshi_list
def __init__(self, nlp, mode=SUDACHI_DEFAULT_SPLITMODE):
    self.nlp = nlp
    resources_path = Path(__file__).parent / "resources"
    config.RESOURCEDIR = str(resources_path)
    setting_path = resources_path / "sudachi.json"
    config.SETTINGFILE = str(setting_path)
    with open(str(setting_path), "r", encoding="utf-8") as f:
        settings = json.load(f)
    settings['systemDict'] = str(resources_path / settings.get('systemDict', 'system_core.dic'))
    settings['characterDefinitionFile'] = str(resources_path / settings.get('characterDefinitionFile', 'char.def'))
    if 'oovProviderPlugin' in settings:
        for plugin in settings['oovProviderPlugin']:
            if plugin['class'] == 'com.worksap.nlp.sudachi.MeCabOovProviderPlugin':
                plugin['charDef'] = str(resources_path / plugin.get('charDef', 'char.def'))
                plugin['unkDef'] = str(resources_path / plugin.get('unkDef', 'unk.def'))
    dict_ = dictionary.Dictionary(settings)
    self.tokenizer = dict_.create()
    self.mode = mode
    self.use_sentence_separator = True
def load_sudachi(mode=None):
    if mode is None:
        mode = tokenizer.Tokenizer.SplitMode.C
    else:
        # Look the mode up by name instead of using eval().
        mode = getattr(tokenizer.Tokenizer.SplitMode, mode)
    t = dictionary.Dictionary().create(mode=mode)
    return t
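A hypothetical usage of load_sudachi, assuming SudachiPy and a core dictionary (e.g. sudachidict_core) are installed; the sentence is illustrative:

tok = load_sudachi("B")  # mode name resolved via getattr
print([m.surface() for m in tok.tokenize("外国人参政権")])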
def __init__(self):
    import json
    from sudachipy import tokenizer, dictionary, config

    with open(config.SETTINGFILE, "r", encoding="utf-8") as f:
        settings = json.load(f)
    self.tokenizer_obj = dictionary.Dictionary(settings).create()
    self.mode = tokenizer.Tokenizer.SplitMode.C
def __init__(self, sp_model_path, bos_eos=True):
    self._sudachi_tokenizer = dictionary.Dictionary().create()
    self._sudachi_mode = tokenizer.Tokenizer.SplitMode.A
    self._sp_tokenizer = spm.SentencePieceProcessor()
    self._sp_tokenizer.load(sp_model_path)
    if bos_eos:
        self._sp_tokenizer.set_encode_extra_options('bos:eos')
def __init__(self, split_mode=None):
    self.tokenizer = dictionary.Dictionary().create()
    if split_mode == 'A':
        self.split_mode = tokenizer.Tokenizer.SplitMode.A
    elif split_mode == 'B':
        self.split_mode = tokenizer.Tokenizer.SplitMode.B
    else:
        self.split_mode = tokenizer.Tokenizer.SplitMode.C
def __init__(self, nlp=None, mode=SUDACHI_DEFAULT_SPLITMODE):
    self.nlp = nlp
    self.vocab = nlp.vocab if nlp is not None else Vocab()
    dictionary = try_import_sudachipy_dictionary()
    dict_ = dictionary.Dictionary()
    self.tokenizer = dict_.create()
    self.mode = mode
    self.use_sentence_separator = True
def tokenizer_inst(self):
    from sudachipy import dictionary
    from sudachipy import config
    import json

    # Build the tokenizer lazily and cache it on first access.
    if self._tokenizer_obj is None:
        with open(config.SETTINGFILE, "r", encoding="utf-8") as f:
            settings = json.load(f)
        self._tokenizer_obj = dictionary.Dictionary(settings).create()
    return self._tokenizer_obj
def __init__(self, stop_words, normalize=False, mode=tokenizer.Tokenizer.SplitMode.B):
    super().__init__(stop_words, normalize)
    with open(config.SETTINGFILE, "r", encoding="utf-8") as f:
        settings = json.load(f)
    self.tokenizer = dictionary.Dictionary(settings).create()
    self.mode = mode
def __init__(self, mention_anchors: Tuple[str, ...] = MENTION_ANCHORS):
    '''
    :param mention_anchors:
    '''
    self.tokenizer = sudachiDic.Dictionary().create()
    self.mode = sudachiTokenizer.Tokenizer.SplitMode.B
    self.mention_anchors = mention_anchors
def tokenize(self, text):
    try:
        import sudachipy
        from sudachipy import dictionary
    except ImportError as e:
        raise ValueError("Sudachi tokenizer requires sudachipy.") from e
    segmenter = dictionary.Dictionary(config_path="conf/sudachi.json").create()
    tokens = segmenter.tokenize(text, sudachipy.tokenizer.Tokenizer.SplitMode.C)
    return [m.surface() for m in tokens]
def try_sudachi_import():
    """SudachiPy is required for Japanese support, so check for it.
    If it's not available, blow up and explain how to fix it."""
    try:
        from sudachipy import dictionary, tokenizer
        tok = dictionary.Dictionary().create(mode=tokenizer.Tokenizer.SplitMode.A)
        return tok
    except ImportError:
        raise ImportError("Japanese support requires SudachiPy: "
                          "https://github.com/WorksApplications/SudachiPy")
def __init__(self):
    with open(sudachipy.config.SETTINGFILE, 'r', encoding='utf-8') as f:
        sudachi_settings = json.load(f)
    dict_ = dictionary.Dictionary(sudachi_settings)  # renamed to avoid shadowing the built-in dict
    self.sudachi_instance = dict_.create()
    self.nameList = {
        '1': '一郎',
        '2': '二郎',
        '3': '三郎',
        '4': '四郎',
        '5': '五郎',
    }
def word_count(texts, exclude_list):
    tokenizer_obj = dictionary.Dictionary().create()
    mode = tokenizer.Tokenizer.SplitMode.C
    words = []
    for text in texts:
        tokens = tokenizer_obj.tokenize(text, mode)
        for token in tokens:
            part_of_speech = token.part_of_speech()[0]
            if part_of_speech == '名詞' and token.dictionary_form() not in exclude_list:
                words.append(token.surface())
    return words
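An illustrative call to word_count; the texts and exclude_list are made-up examples, not from the original project:

# collects noun surfaces, skipping any whose dictionary form is excluded
texts = ["東京タワーからの景色は素晴らしい", "景色の写真を撮った"]
print(word_count(texts, exclude_list=["写真"]))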
def __init__(self, config):
    """
    Construct a SudachiPy-based tokenizer.

    Note that this tokenizer uses regex for sentence segmentation.
    """
    if config['lang'] != 'ja':
        raise Exception("SudachiPy tokenizer is only allowed in Japanese pipelines.")
    check_sudachipy()
    from sudachipy import tokenizer
    from sudachipy import dictionary

    self.tokenizer = dictionary.Dictionary().create()
def init(cls, mode: Literal["A", "B", "C"], dic=None):
    from sudachipy import dictionary, tokenizer

    if mode not in {"A", "B", "C"}:
        raise ValueError(mode)
    _mode = getattr(tokenizer.Tokenizer.SplitMode, mode)
    if dic is None:
        dic = dictionary.Dictionary().create()
    cls.mode = mode
    cls.sudachi = dic
    cls._mode = _mode
def __init__(self, path, sudachiDataPath="sudachiData.pickle"):
    f = open(path, 'r')
    self.file = f
    self.reader = csv.reader(f, delimiter=' ')
    # On first use, build the vocabulary and file-offset lists (quite slow);
    # on later runs, load the pickled versions instead.
    if os.path.exists(sudachiDataPath):
        with open(sudachiDataPath, 'rb') as f:
            dataset = pickle.load(f)
        self.offset_list = dataset["offset_list"]
        self.emb_size = dataset["emb_size"]
        self.word2index = dataset["word2index"]
        self.ave_vec = dataset["ave_vec"]
    else:
        txt = f.readline()
        # Dimensionality of the embeddings.
        self.emb_size = int(txt.split()[1])
        # Average vector, returned for out-of-vocabulary words.
        self.ave_vec = np.zeros(self.emb_size, dtype=float)
        # List of file offsets, one per word.
        self.offset_list = []
        word_list = []
        count = 0
        maxCount = int(txt.split()[0])
        while True:
            count += 1
            self.offset_list.append(f.tell())
            if count % 100000 == 0:
                print(count, "/", maxCount)
            line = f.readline()
            if line == '':
                break
            line_list = line.split()
            word_list.append(line_list[0])
            self.ave_vec += np.array(line_list[-300:]).astype(float)
        self.offset_list.pop()  # drop the offset recorded at EOF
        self.ave_vec = self.ave_vec / count
        self.word2index = {v: k for k, v in enumerate(word_list)}
        dataset = {}
        dataset["offset_list"] = self.offset_list
        dataset["emb_size"] = self.emb_size
        dataset["word2index"] = self.word2index
        dataset["ave_vec"] = self.ave_vec
        with open(sudachiDataPath, 'wb') as f:
            pickle.dump(dataset, f)
    self.num_rows = len(self.offset_list)
    # Set up Sudachi.
    self.tokenizer_obj = dictionary.Dictionary().create()
    self.mode = tokenizer.Tokenizer.SplitMode.B
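A hedged sketch of how the offset list built above supports random access into the embedding file; getVector is a hypothetical helper, not part of the original class:

def getVector(self, word):
    # hypothetical: seek to the word's line and parse its 300-dim vector
    if word not in self.word2index:
        return self.ave_vec  # average vector for unknown words
    self.file.seek(self.offset_list[self.word2index[word]])
    return np.array(self.file.readline().split()[-300:]).astype(float)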
def get_token(self, source):
    """
    Morphological analysis (Sudachi).

    Args:
        source (str): sentence to analyze

    Returns:
        List[str]: list of analyzed word surfaces
    """
    tokenizer_obj = dictionary.Dictionary().create()
    result = [m.surface() for m in tokenizer_obj.tokenize(source, self.mode)]
    return result
def __init__(self, mode: str, with_postag: bool, **kwargs) -> None:
    """
    Initializer for SudachiTokenizer

    Parameters
    ---
    mode (str)
        Splitting mode which controls the granularity of konoha.token.
        (mode should be `A`, `B` or `C`)
        For more information, see the following links.
        - document: https://github.com/WorksApplications/Sudachi#the-modes-of-splitting  # NOQA
        - paper: http://www.lrec-conf.org/proceedings/lrec2018/summaries/8884.html  # NOQA
    with_postag (bool=False)
        flag that determines whether konoha.tokenizer includes POS tags.
    **kwargs
        others.
    """
    try:
        from sudachipy import tokenizer
        from sudachipy import dictionary
    except ImportError:
        msg = "importing sudachipy failed for some reason."
        msg += "\n  1. make sure SudachiPy is successfully installed."
        msg += "\n  2. make sure dictionary is successfully installed."
        raise ImportError(msg)

    super(SudachiTokenizer, self).__init__(
        name="sudachi ({})".format(mode),
        with_postag=with_postag,
    )
    try:
        self._tokenizer = dictionary.Dictionary().create()
    except KeyError:
        msg = "please install dictionary"
        msg += " ( see https://github.com/WorksApplications/SudachiPy#install-dict-packages )"  # NOQA
        raise KeyError(msg)

    _mode = mode.capitalize()
    if _mode == "A":
        self._mode = tokenizer.Tokenizer.SplitMode.A
    elif _mode == "B":
        self._mode = tokenizer.Tokenizer.SplitMode.B
    elif _mode == "C":
        self._mode = tokenizer.Tokenizer.SplitMode.C
    else:
        raise ValueError("Invalid mode is specified. Mode should be A, B, or C.")  # NOQA
def __init__(self, nlp=None, mode=SUDACHIPY_DEFAULT_SPLIT_MODE, config_path=None):
    self.nlp = nlp
    self.vocab = nlp.vocab if nlp is not None else Vocab()
    dictionary = try_import_sudachipy_dictionary()
    split_mode = sudachipy_split_mode(mode)
    if not config_path:
        config_path = dict_package_path() / 'sudachi.json'
    dict_ = dictionary.Dictionary(config_path=config_path)
    self.tokenizer = dict_.create(mode=split_mode)
    self._mode = mode
    self.use_sentence_separator = True
    self.enable_ex_sudachi = False
def get_word_freqs(texts):
    """
    Given a list of strings, run morphological analysis on every string and
    return, as a dict, each normalized word's occurrence count together with
    its analysis results.
    Ignored POS tags: supplementary symbols, whitespace, auxiliary verbs,
    particles, pronouns, prefixes, suffixes.

    - return
        {
            <normalized word>: {
                "count": <occurrence count>,
                "raws": [<token>, <token>, ...]
            }
        }
    """
    # Strip URLs.
    texts = [
        re.sub(r'(http|https)://([-\w]+\.)+[-\w]+(/[-\w./?%&=]*)?', "", text)
        for text in texts
    ]
    word_freqs = {}
    # Build the morphological analyzer.
    tokenizer_obj = dictionary.Dictionary().create()
    mode = tokenizer.Tokenizer.SplitMode.C
    print("\ntokenizing ...")
    for text in tqdm(texts):
        tokens = tokenizer_obj.tokenize(text, mode)
        for token in tokens:
            # Skip the ignored POS tags.
            pos = token.part_of_speech()[0]
            if pos in ["補助記号", "空白", "助動詞", "助詞", "代名詞", "接頭辞", "接尾辞"]:
                continue
            normalized_token = token.normalized_form()
            if normalized_token in word_freqs:
                # Count a word seen before.
                word_freqs[normalized_token]["count"] += 1
                word_freqs[normalized_token]["raws"] += [token]
            else:
                # Count a word appearing for the first time.
                word_freqs[normalized_token] = {}
                word_freqs[normalized_token]["count"] = 1
                word_freqs[normalized_token]["raws"] = [token]
    return word_freqs
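An illustrative use of get_word_freqs (requires sudachipy, a Sudachi dictionary, and tqdm); the input strings are invented:

freqs = get_word_freqs(["今日は良い天気です", "明日も良い天気らしい https://example.com"])
for word, info in sorted(freqs.items(), key=lambda kv: -kv[1]["count"]):
    print(word, info["count"])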
def __init__(self, nlp=None, mode=SUDACHI_DEFAULT_SPLITMODE):
    self.nlp = nlp
    self.vocab = nlp.vocab if nlp is not None else Vocab()
    dictionary = try_import_sudachipy_dictionary()
    split_mode_enum = try_import_sudachipy_split_mode()
    if mode == 'A':
        split_mode = split_mode_enum.A
    elif mode == 'B':
        split_mode = split_mode_enum.B
    elif mode == 'C':
        split_mode = split_mode_enum.C
    else:
        raise Exception('mode must be A, B, or C ({})'.format(str(mode)))
    dict_ = dictionary.Dictionary()
    self.tokenizer = dict_.create(mode=split_mode)
    self.use_sentence_separator = True
def __init__(self, mode: str, with_postag: bool, **kwargs):
    """
    Initializer for SudachiTokenizer

    Parameters
    ---
    mode (str)
        Splitting mode which controls the granularity of tiny_tokenizer.token.
        (mode should be `A`, `B` or `C`)
        For more information, see the following links.
        - document: https://github.com/WorksApplications/Sudachi#the-modes-of-splitting  # NOQA
        - paper: http://www.lrec-conf.org/proceedings/lrec2018/summaries/8884.html  # NOQA
    with_postag (bool=False)
        flag that determines whether tiny_tokenizer.tokenizer includes POS tags.
    **kwargs
        others.
    """
    super(SudachiTokenizer, self).__init__(f"sudachi ({mode})")
    try:
        from sudachipy import tokenizer
        from sudachipy import dictionary
    except ModuleNotFoundError:
        raise ModuleNotFoundError("sudachipy is not installed")
    try:
        self.tokenizer = dictionary.Dictionary().create()
    except KeyError:
        msg = "please install dictionary"
        msg += " ( see https://github.com/WorksApplications/SudachiPy#install-dict-packages )"  # NOQA
        raise KeyError(msg)

    _mode = mode.capitalize()
    if _mode == "A":
        self.mode = tokenizer.Tokenizer.SplitMode.A
    elif _mode == "B":
        self.mode = tokenizer.Tokenizer.SplitMode.B
    elif _mode == "C":
        self.mode = tokenizer.Tokenizer.SplitMode.C
    else:
        msg = "Invalid mode is specified. Mode should be 'A', 'B' or 'C'"
        raise ValueError(msg)
    self.with_postag = with_postag
def try_sudachi_import(split_mode="A"):
    """SudachiPy is required for Japanese support, so check for it.
    If it's not available, blow up and explain how to fix it.
    split_mode should be one of these values: "A", "B", "C", None->"A"."""
    try:
        from sudachipy import dictionary, tokenizer

        split_mode = {
            None: tokenizer.Tokenizer.SplitMode.A,
            "A": tokenizer.Tokenizer.SplitMode.A,
            "B": tokenizer.Tokenizer.SplitMode.B,
            "C": tokenizer.Tokenizer.SplitMode.C,
        }[split_mode]
        tok = dictionary.Dictionary().create(mode=split_mode)
        return tok
    except ImportError:
        raise ImportError(
            "Japanese support requires SudachiPy and SudachiDict-core "
            "(https://github.com/WorksApplications/SudachiPy). "
            "Install with `pip install sudachipy sudachidict_core` or "
            "install spaCy with `pip install spacy[ja]`.")
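A hypothetical call to try_sudachi_import showing the split-mode mapping; the sample sentence is illustrative:

tok = try_sudachi_import(split_mode="C")  # None would also map to mode A per the table above
print([m.surface() for m in tok.tokenize("すもももももももものうち")])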
def __init__(self, mode: str) -> None:
    from sudachipy import dictionary
    from sudachipy import tokenizer

    super().__init__(name="sudachi ({})".format(mode))
    try:
        self._tokenizer = dictionary.Dictionary().create()
    except KeyError:
        msg = "Loading a dictionary failed."
        msg += " ( see https://github.com/WorksApplications/SudachiPy#install-dict-packages )"  # NOQA
        raise KeyError(msg)

    _mode = mode.capitalize()
    if _mode == "A":
        self._mode = tokenizer.Tokenizer.SplitMode.A
    elif _mode == "B":
        self._mode = tokenizer.Tokenizer.SplitMode.B
    elif _mode == "C":
        self._mode = tokenizer.Tokenizer.SplitMode.C
    else:
        raise ValueError("Invalid mode is specified. Mode should be A, B, or C.")  # NOQA
def get_token(self, source):
    with open(sudachipy.config.SETTINGFILE, "r", encoding="utf-8") as f:
        settings = json.load(f)
    tokenizer_obj = dictionary.Dictionary(settings).create()
    mode = tokenizer.Tokenizer.SplitMode.C
    # Note: this targets the old SudachiPy API, where tokenize() takes (mode, text).
    result = [m.surface() for m in tokenizer_obj.tokenize(mode, source)]
    word_list = []
    for mrph in result:
        if not (mrph == ""):
            norm_word = tokenizer_obj.tokenize(mode, mrph)[0].normalized_form()
            hinsi = tokenizer_obj.tokenize(mode, norm_word)[0].part_of_speech()[0]
            # Adopt a word only if its normalized form has one of these POS tags.
            if hinsi in ["名詞", "動詞", "形容詞"]:
                word = tokenizer_obj.tokenize(mode, norm_word)[0].dictionary_form()
                word_list.append(word)
    return word_list
def __init__(self):
    super().__init__()
    INPUT_PRETRAINED_VECTORS = "/data/chive_v1.2mc90/chive-1.2-mc90_gensim/chive-1.2-mc90.kv"
    pretrained_vectors = gensim.models.KeyedVectors.load(INPUT_PRETRAINED_VECTORS)
    # Filtering based on the pretrained vectors.
    words_found_in_pretrained_vectors = set([t for t in pretrained_vectors.vocab.keys()])
    # Of all the text, keep only words present in the vectors and use them as the vocab.
    self.tokenizer = dictionary.Dictionary().create()
    # Tokenize.
    token_sequences = []
    for instance in tqdm(self.text):
        token_sequence = [token.surface() for token in self.tokenizer.tokenize(instance)]
        token_sequences.append(token_sequence)
    # Build a Vocab from words that occur both in the training corpus and in
    # the pretrained embeddings.
    counter = Counter()
    for token_sequence in tqdm(token_sequences):
        counter.update(words_found_in_pretrained_vectors & set(token_sequence))
    self.vocab = torchtext.vocab.vocab(counter)
    self.vocab.insert_token("<pad>", 0)
    self.vocab.insert_token("<unk>", 1)
    self.vocab.set_default_index(1)
    # Convert token sequences to id sequences.
    self.id_sequences = []
    for token_sequence in token_sequences:
        self.id_sequences.append(torch.tensor([self.vocab[token] for token in token_sequence]))
    # Build the training vectors; to keep them small, use only the
    # intersection with the training-data vocabulary.
    vectors_for_unk_and_pad = np.zeros((2, 300))
    itos = self.vocab.get_itos()
    words = [itos[i] for i in range(len(self.vocab))]
    self.vectors = np.concatenate(
        (vectors_for_unk_and_pad, np.array([pretrained_vectors[w] for w in words[2:]])),
        axis=0)
import os

from sudachipy import dictionary as sudachi_dict

VERBOSE = False
PARAPHRASE = False
WEIGHTED_SIMILARITY = False
TOKENIZER = sudachi_dict.Dictionary().create()
PROJECT_DIR = os.path.dirname(os.path.realpath('__file__'))
SIMILARITY = None
WIKI_STATS = None
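An example use of the module-level TOKENIZER defined above; the sentence and mode choice are illustrative:

from sudachipy import tokenizer as sudachi_tokenizer

morphemes = TOKENIZER.tokenize("吾輩は猫である", sudachi_tokenizer.Tokenizer.SplitMode.C)
print([(m.surface(), m.part_of_speech()[0]) for m in morphemes])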
def main(args):
    with open(args.akama_file, encoding='utf8') as f:
        style_lines = f.read().rstrip().split('\n')
    header = style_lines[0]
    del style_lines[0]
    entry_to_sents = {}
    for line in style_lines:
        comps = line.split(',')
        entry_to_sents[comps[0]] = []
        entry_to_sents[comps[1]] = []
    tokenizer_obj = dictionary.Dictionary(args.sudachipy_config).create()
    with open(args.corpus, encoding='utf8') as f:
        for i, line in enumerate(f):
            line = line.rstrip()
            if i % 1000 == 0:
                print(f'Processing line {i}')
            morphemes = tokenizer_obj.tokenize(line)
            for m in morphemes:
                all_pos = m.part_of_speech()
                for pos_i in range(min(2, len(all_pos)) - 1, -1, -1):
                    entry = m.surface() + '/' + all_pos[pos_i]
                    if entry in entry_to_sents:
                        entry_to_sents[entry].append(line)
                        break
    for l in entry_to_sents.values():
        # Keep the shortest sentences.
        l.sort(key=lambda s: len(s))
        if len(l) > args.sentences_per_pair * 2:
            del l[args.sentences_per_pair * 2:]
        random.shuffle(l)
    found, not_found = 0, 0
    total_list_len = 0
    for key, l in entry_to_sents.items():
        if l:
            found += 1
            total_list_len += len(l)
        else:
            not_found += 1
    print(f'Found {found} / {found + not_found} entries, avg list len {total_list_len / found}')
    found_pairs = 0
    for split in ['dev', 'test']:
        with open(getattr(args, 'out_path_' + split), 'w', encoding='utf8') as f:
            wr = csv.writer(f, quoting=csv.QUOTE_ALL)
            wr.writerow(['sentence 1', 'sentence 2'] + header.split(','))
            for line in style_lines:
                comps = line.split(',')
                entry1, entry2 = comps[0], comps[1]
                found = False
                right_center = random.randint(0, 1)
                half1, half2 = (len(entry_to_sents[entry1]) + right_center) // 2, \
                               (len(entry_to_sents[entry2]) + right_center) // 2
                if split == 'dev':
                    l1, l2 = entry_to_sents[entry1][:half1], entry_to_sents[entry2][:half2]
                else:
                    l1, l2 = entry_to_sents[entry1][half1:], entry_to_sents[entry2][half2:]
                for entry1_sent in l1:
                    for entry2_sent in l2:
                        found = True
                        wr.writerow([entry1_sent] + [entry2_sent] + [entry1] + [entry2] + comps[2:])
                if found:
                    found_pairs += 1
    print(f'Found {found_pairs} / {len(style_lines)} entry pairs')