def __init__(self): """Constructs a Tokenizer for Juman++. """ from pyknp import Juman self.do_lower_case = False self._jumanpp = Juman()
def run(self):
    data = self.load()
    jumanpp = Juman()
    output = []
    for _, row in data.iterrows():
        # Normalize to full-width (zenkaku) before Juman++ analysis.
        zenkaku = jaconv.h2z(row["sentence"], ascii=True, digit=True)
        tokens = [mrph.midasi for mrph in jumanpp.analysis(zenkaku).mrph_list()]
        if self.task_name == 'QA_B':
            qa_zenkaku = jaconv.h2z(
                f"{row['target']}の{row['aspect']}は{row['sentiment']}",
                ascii=True,
                digit=True,
            )
        else:
            qa_zenkaku = " "
        qa_tokens = [mrph.midasi for mrph in jumanpp.analysis(qa_zenkaku).mrph_list()]
        output.append({
            "context": " ".join(tokens),
            "qa": " ".join(qa_tokens),
            "label": 1,
        })
    self.dump(pd.DataFrame(output))
def __init__(self, command='knp', server=None, port=31000, timeout=60,
             option='-tab', rcfile='', pattern=r'EOS',
             jumancommand='juman', jumanrcfile='', jumanpp=False):
    self.command = command
    self.server = server
    self.port = port
    self.timeout = timeout
    self.option = option
    self.rcfile = rcfile
    self.pattern = pattern
    self.socket = None
    self.subprocess = None
    self.jumanpp = (jumancommand == "jumanpp") or jumanpp

    if self.rcfile and not os.path.isfile(os.path.expanduser(self.rcfile)):
        sys.stderr.write("Can't read rcfile (%s)!\n" % self.rcfile)
        sys.exit(1)

    if self.jumanpp:
        self.juman = Jumanpp()
    else:
        self.juman = Juman(command=jumancommand, rcfile=jumanrcfile)
def analyzer():
    bc = BertClient(ip='bertserving', output_fmt='list')
    client = Elasticsearch('elasticsearch:9200')
    texts = []
    list_text = []
    jumanpp = Juman()

    # Tokenize the query with Juman++ so it matches the indexed, pre-tokenized text.
    query = request.args.get('q')
    result = jumanpp.analysis(query)
    for mrph in result.mrph_list():
        texts.append(mrph.midasi)
    list_text.append(" ".join(texts))

    query_vector = bc.encode(list_text, is_tokenized=False)[0]
    script_query = {
        "script_score": {
            "query": {"match": {"source": "tb"}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, doc['question_vector']) + 1.0",
                "params": {"query_vector": query_vector},
            },
        }
    }
    response = client.search(
        index=INDEX_NAME,
        body={"size": SEARCH_SIZE, "query": script_query},
    )
    return jsonify(response)
def __init__(self, command='knp', option='-tab', rcfile='', server=None,
             port=31000, timeout=30, pattern=r'(?:^|\n)EOS($|\n)',
             jumanrcfile='', juman_option='-e2 -B', juman_port=32000,
             juman_command='juman', jumanpp=False):
    self.use_jumanpp = (juman_command == "jumanpp") or jumanpp
    assert 'EOS' in pattern
    self.pattern = pattern
    self.EOS = 'EOS'
    # Only tab-format output can be parsed.
    assert '-tab' in option

    if rcfile and not os.path.isfile(os.path.expanduser(rcfile)):
        sys.stderr.write("Can't read rcfile (%s)!\n" % rcfile)
        sys.exit(1)

    # Set up Juman(++)
    assert port != juman_port
    juman_args = {'option': juman_option, 'rcfile': jumanrcfile,
                  'server': server, 'port': juman_port}
    if self.use_jumanpp:
        self.juman = Jumanpp(**juman_args)
    else:
        self.juman = Juman(**juman_args)

    # Set up KNP
    if server is not None:
        self.socket = Socket(server, port, option=option, timeout=timeout)
        self.query = partial(self.socket.query, pattern=pattern)
    else:
        if rcfile:
            option += " -r {}".format(rcfile)
        self.subprocess = Subprocess(command, option=option)
        self.query = partial(self.subprocess.query, pattern=pattern)
def __init__(self, command='knp', server=None, port=31000, timeout=60,
             option='-tab', rcfile='', pattern=r'EOS',
             jumancommand='jumanpp', jumanrcfile='', jumanoption='',
             jumanpp=True):
    self.command = command
    self.server = server
    self.port = port
    self.timeout = timeout
    self.options = option.split()
    self.rcfile = rcfile
    self.pattern = pattern
    self.socket = None
    self.subprocess = None
    self.jumanpp = jumanpp

    if self.rcfile and not os.path.isfile(os.path.expanduser(self.rcfile)):
        raise Exception("Can't read rcfile (%s)!" % self.rcfile)
    if distutils.spawn.find_executable(self.command) is None:
        raise Exception("Can't find KNP command: %s" % self.command)

    self.juman = Juman(command=jumancommand, rcfile=jumanrcfile,
                       option=jumanoption, jumanpp=self.jumanpp)
def _apply_jumanpp(self, inp: str) -> Tuple[str, str]:
    jumanpp = Juman(command=self.juman, option=self.juman_option)
    jumanpp_result = jumanpp.analysis(inp)
    jumanpp_out = jumanpp_result.spec() + 'EOS\n'
    jumanpp_conll_out = self._jumanpp2conll_one_sentence(jumanpp_out) + 'EOS\n'
    return jumanpp_out, jumanpp_conll_out
def main(DATA_ROOT):
    text_files = Path(DATA_ROOT).glob('**/*.txt')
    for text_file in text_files:
        with open(text_file) as f:
            content = f.read()
        content = re.sub(r"=+(.*?)=+", r"\g<1>", content)
        content = re.sub(r"^\n", "", content, flags=re.MULTILINE)
        content = content.replace('<block>', '')
        content = content.replace('<math-element>', '')
        # In this case, 。 can be removed safely.
        sentences = re.split(r"[。\n]", content)
        sentences = [line for line in sentences if len(line) != 0]
        sentences = [''.join(line.split()) for line in sentences]

        # Drop sentences that are not properly parsed.
        val_sentences = []
        offsets = []
        juman = Juman()
        for sentence in tqdm(sentences):
            try:
                result = juman.analysis(sentence)
            except ValueError:
                print(sentence)
                continue  # skip sentences Juman++ cannot parse
            current = 0
            offset = [0 for _ in range(len(sentence))]
            for mrph in result.mrph_list():
                current = current + len(mrph.midasi)
                try:
                    offset[current - 1] = 1
                except IndexError as e:
                    print(sentence)
                    print(current)
                    for _mrph in result.mrph_list():
                        print(_mrph.midasi)
                    raise e
            val_sentences.append(sentence)
            offsets.append(offset)

        results = (val_sentences, offsets)
        file_name = text_file.name[:-4] + '.pickle'
        dic = text_file.parent
        with open(Path(dic, file_name), 'wb') as f:
            pickle.dump(results, f)
def main(bert_vocab_filepath, example_filepath, context_filepath, cache_save_dir):
    # Juman++
    juman = Juman(jumanpp=True)

    logger.info("Cache files will be saved in {}.".format(cache_save_dir))

    # Tokenizer
    logger.info("Create a tokenizer from {}.".format(bert_vocab_filepath))
    tokenizer = BertTokenizer.from_pretrained(bert_vocab_filepath, do_lower_case=False)

    logger.info("Start loading examples from {}.".format(example_filepath))
    examples = load_examples(example_filepath)
    logger.info("Finished loading examples.")
    logger.info("Number of examples: {}".format(len(examples)))

    logger.info("Start loading contexts from {}.".format(context_filepath))
    contexts = load_contexts(context_filepath)
    logger.info("Finished loading contexts.")

    logger.info("Start encoding examples.")
    encoding = encode_examples(juman, tokenizer, examples, contexts, 512)
    logger.info("Finished encoding examples.")

    os.makedirs(cache_save_dir, exist_ok=True)
    torch.save(encoding["input_ids"], os.path.join(cache_save_dir, "input_ids.pt"))
    torch.save(encoding["attention_mask"], os.path.join(cache_save_dir, "attention_mask.pt"))
    torch.save(encoding["token_type_ids"], os.path.join(cache_save_dir, "token_type_ids.pt"))
    torch.save(encoding["labels"], os.path.join(cache_save_dir, "labels.pt"))
    logger.info("Saved cache files in {}.".format(cache_save_dir))
def string_word_point(self, df):
    jumanpp = Juman(jumanpp=False)
    tmp_word = []
    df_time_word = pd.DataFrame(index=[], columns=['time', 'word'])  # words and their timestamps
    df_word_point = pd.DataFrame(index=[], columns=['word', 'point'])  # words and their occurrence counts
    df_time_point = pd.DataFrame(index=[], columns=['time', 'point'])  # comment counts per timestamp
    df_time_www_point = pd.DataFrame(index=[], columns=['time', 'point'])  # "www" (laughter) counts per timestamp
    df_time_hakusyu_point = pd.DataFrame(index=[], columns=['time', 'point'])  # applause counts per timestamp
    df_URL_point = pd.DataFrame(index=[], columns=['URL', 'point'])  # URL summary

    for i in range(len(df)):
        # If the comment contains a URL, record it.
        url = URL_hanbetu(df['comment'][i])
        if url != False:
            tmp = self.my_index(df_URL_point['URL'], url)
            df_URL_point = self.make_df_append(df_URL_point, tmp, url)

        # Strip symbols from the comment.
        print(df['comment'][i])
        df['comment'][i] = self.my_delete(df['comment'][i])

        # Convert h:m:s to hms.
        tmp_time = self.strtime_to_inttime(df['time'][i])

        # Count comments per timestamp.
        tmp = self.my_index(df_time_point['time'], tmp_time)
        df_time_point = self.make_df_append(df_time_point, tmp, tmp_time)

        # Add 1 if the comment contains "www", otherwise 0.
        print(url)
        if False != self.www_hanbetu(df['comment'][i]) and url == False:
            df_time_www_point = self.make_df_append(df_time_www_point, tmp, tmp_time)
        else:
            if False == tmp:
                df_time_www_point = df_time_www_point.append(
                    {'time': tmp_time, 'point': 0}, ignore_index=True)

        # Add 1 if the comment contains applause, otherwise 0.
        if False != self.hakusyu_hanbetu(df['comment'][i]):
            df_time_hakusyu_point = self.make_df_append(df_time_hakusyu_point, tmp, tmp_time)
        else:
            if False == tmp:
                df_time_hakusyu_point = df_time_hakusyu_point.append(
                    {'time': tmp_time, 'point': 0}, ignore_index=True)

        # Morphological analysis.
        result = jumanpp.analysis(df['comment'][i])

        # Build DataFrames from the analysis result.
        for token in result.mrph_list():
            tmp_word = token.midasi
            # Count noun occurrences.
            if 0 != self.word_Classification(token.hinsi):
                # If it is a noun:
                if self.word_Classification(token.hinsi) == '名詞':
                    tmp = self.my_index(df_word_point['word'], tmp_word)
                    df_word_point = self.make_df_append(df_word_point, tmp, tmp_word)
                    # Record the noun together with its timestamp.
                    df_time_word = df_time_word.append(
                        {'time': tmp_time, 'word': tmp_word}, ignore_index=True)

    return (df_time_word, df_word_point, df_time_point,
            df_time_www_point, df_time_hakusyu_point, df_URL_point)
def morphological_analysis(self, text):
    jumanpp = Juman()
    ret = []
    text = self.remove_special_character(text)
    result = jumanpp.analysis(text)  # this splits the text into words
    for mrph in result.mrph_list():
        ret += self.modification(mrph.midasi)
    return ret
def __init__(self, bert_model: str, fine_tuned_model: str, jumanpp_command: str):
    self.jumanpp = Juman(command=jumanpp_command)
    self.tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=False)
    self.model = BertPosNegClassifier(bert_model)
    state_dict = torch.load(fine_tuned_model, map_location=torch.device('cpu'))
    # Strip the DataParallel 'module.' prefix before loading the weights.
    self.model.load_state_dict({k.replace('module.', ''): v for k, v in state_dict.items()})
    self.model.eval()
def counter(text, d):
    jumanpp = Juman()
    result = jumanpp.analysis(text)
    for mrph in result.mrph_list():
        # Count occurrences of each lemma (genkei).
        d[mrph.genkei] = d.get(mrph.genkei, 0) + 1
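# Hedged usage sketch for counter() above; count_corpus and the sample texts
# are illustrative, not from the original source.
def count_corpus(texts):
    freq = {}
    for text in texts:
        counter(text, freq)  # counter() mutates the shared dict in place
    return freq

# freq = count_corpus(["これはペンです。", "ペンを使います。"])
# freq.get("ペン")  # -> 2, since lemmas (genkei) are the keys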
def __init__(self, preprocessor=None, stopwords=None):
    self.jumanpp = Juman()
    self.preprocessor = preprocessor
    # Avoid a mutable default argument.
    self.stopwords = stopwords if stopwords is not None else []
def test_juman_wrapper(self):
    juman = Juman()
    result = juman.analysis(u"これはペンです。")
    print(','.join(mrph.midasi for mrph in result))
    for mrph in result.mrph_list():
        assert isinstance(mrph, pyknp.Morpheme)
        print(u"surface:%s, reading:%s, lemma:%s, POS:%s, POS subcategory:%s, "
              u"conjugation type:%s, conjugation form:%s, semantic info:%s, "
              u"representative notation:%s"
              % (mrph.midasi, mrph.yomi, mrph.genkei, mrph.hinsi, mrph.bunrui,
                 mrph.katuyou1, mrph.katuyou2, mrph.imis, mrph.repname))
def __init__(self): """ 日本語専用トークナイザの構築。 JUMAN++ を使用する。 """ from pyknp import Juman self.do_lower_case = False self._jumanpp = Juman()
def juman_list(text):
    jumanpp = Juman()
    result = jumanpp.analysis(text)
    # Replace every alphabetic token with the string "En".
    wakati = [
        mrph.genkei if mrph.bunrui != "アルファベット" else "En"
        for mrph in result.mrph_list()
    ]
    return ",".join(wakati)
def _morphological_analysis(tweet: str) -> List[str]:
    '''Morphologically analyze a tweet and return a list of lemmas.'''
    text = _remove_unnecessary(tweet)
    if not text:
        return []
    # Keep only nouns, verbs, adjectives, and suffixes.
    return [mrph.genkei for mrph in Juman().analysis(text).mrph_list()
            if mrph.hinsi in ['名詞', '動詞', '形容詞', '接尾辞']]
def test_unknown_word(self):
    # Before disableError is called, an error is raised.
    with self.assertRaises(KeyError):
        message = kotodama.transformVerb("嫌いだ", {"過去"})
    kotodama.disableError(Juman())
    # print(kotodama.transformVerb("嫌いだ", set()))
    self.assertEqual(kotodama.transformVerb("嫌いだ", set("過去")), '嫌いだ')
def get_repname_using_jumanpp(genkei: str, pos: str) -> str:
    if pos == '助詞':
        return f'{genkei}/{genkei}'
    juman = Juman(option='-s 1')
    mrphs = juman.analysis(genkei, juman_format=JUMAN_FORMAT.LATTICE_TOP_ONE)
    # Check that the analysis did not go wrong (i.e. the lemma is a single morpheme).
    if len(mrphs) == 1:
        return mrphs[0].repname
    return f'{genkei}/{genkei}'
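# Hedged usage sketch (illustrative, not from the original source): particles
# short-circuit to the 'genkei/genkei' fallback; everything else goes through
# Juman++ and keeps its representative notation only if it analyzes to a
# single morpheme.
# get_repname_using_jumanpp('は', '助詞')      # -> 'は/は'
# get_repname_using_jumanpp('食べる', '動詞')  # -> e.g. '食べる/たべる'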
def __init__(self):
    '''Initialize.

    Examples
    --------
    >>> nlp = JNLP()
    '''
    self.juman = Juman()
    self.KNP = KNP(option='-tab -anaphora')
def test_juman_wrapper(self):
    try:
        juman = Juman(command=self.path_to_juman_command)
        result = juman.analysis("これはペンです。")
        logger.debug(','.join(mrph.midasi for mrph in result))
        for mrph in result.mrph_list():
            assert isinstance(mrph, pyknp.Morpheme)
            logger.debug(
                "surface:%s, reading:%s, lemma:%s, POS:%s, POS subcategory:%s, "
                "conjugation type:%s, conjugation form:%s, semantic info:%s, "
                "representative notation:%s"
                % (mrph.midasi, mrph.yomi, mrph.genkei, mrph.hinsi, mrph.bunrui,
                   mrph.katuyou1, mrph.katuyou2, mrph.imis, mrph.repname))
    except ImportError:
        print('skip test_juman_wrapper')
def seg2word(seg):
    # Split long input into 1000-character chunks before analysis.
    len_split = 1000
    # seg = seg_in.replace(' ', '\u3000')
    # seg = seg_in.replace(' ', ' ')
    len_seg = len(seg)
    seg_splits = [seg[i:i + len_split] for i in range(0, len_seg, len_split)]
    juman_def = Juman(command="/mnt/gold/users/s18153/bin/jumanpp")
    return ' '.join(
        " ".join(mrph.midasi for mrph in juman_def.analysis(seg_part).mrph_list())
        for seg_part in seg_splits)
def bulk_predict(docs, batch_size=256):
    """Predict BERT embeddings."""
    jumanpp = Juman(jumanpp=False)
    for i in range(0, len(docs), batch_size):
        batch_docs = docs[i: i + batch_size]
        pre_embedding_docs = []
        for doc in batch_docs:
            # Analyze the question in chunks short enough for Juman++.
            for k in range(0, len(doc['question']), MAX_TXT_LENGTH):
                result = jumanpp.analysis(doc['question'][k:k + MAX_TXT_LENGTH])
                texts = [mrph.midasi for mrph in result.mrph_list()]
                pre_embedding_docs.append(" ".join(texts))
        embeddings = bc.encode(pre_embedding_docs, is_tokenized=True)
        yield embeddings
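# Hedged usage sketch (illustrative, not from the original source; `docs` and
# the downstream indexing step are assumptions): bulk_predict is a generator,
# so embeddings are produced one batch at a time.
# for batch_embeddings in bulk_predict(docs, batch_size=256):
#     index_batch(batch_embeddings)  # hypothetical downstream consumer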
def parser_func_juman(lemmatize: bool = True) -> Callable[[str], List[str]]:
    juman = Juman()
    if lemmatize:
        def f(s: str) -> List[str]:
            return [m.genkei for m in juman.analysis(s)]
        return f
    else:
        def g(s: str) -> List[str]:
            return [m.midasi for m in juman.analysis(s)]
        return g
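# Hedged usage sketch (illustrative, not from the original source): the factory
# returns a parser keyed to either lemmas (genkei) or surface forms (midasi).
tokenize_lemma = parser_func_juman(lemmatize=True)
tokenize_surface = parser_func_juman(lemmatize=False)
# tokenize_lemma("走りました")    # -> e.g. ['走る', 'ます', 'た']
# tokenize_surface("走りました")  # -> e.g. ['走り', 'まし', 'た'] (exact segmentation depends on Juman++)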
def juman_wakati(text, hinshi=(), DEBUG=False, STEM_FLAG=False):
    juman = Juman()
    output = ""
    # wakati
    result = juman.analysis(text)
    for mrph in result.mrph_list():
        if STEM_FLAG and mrph.hinsi in hinshi:
            output += mrph.repname.split("/")[0] + " "
        if DEBUG:
            print("stem:", mrph.repname)
            print("midashi:", mrph.midasi)
            print("hinsi:", mrph.hinsi)
            print("yomi:", mrph.yomi)
    return output.strip()
def __init__(
    self,
    cls: Type["Defaults"],
    nlp: Optional[Language] = None,
    juman_kwargs: Optional[Dict[str, str]] = None,
    preprocessor: Optional[Callable[[str], str]] = han_to_zen_normalize,
):
    """
    Args:
        juman_kwargs: passed to `pyknp.Juman.__init__`
        preprocessor: applied to text before tokenizing. `mojimoji.han_to_zen` is often used.
    """
    from pyknp import Juman

    juman_kwargs = juman_kwargs or {}
    default_command = get_juman_command()
    assert default_command
    juman_kwargs.setdefault("command", default_command)

    self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
    self.tokenizer = Juman(**juman_kwargs) if juman_kwargs else Juman()
    self.juman_kwargs = juman_kwargs
    self.preprocessor = preprocessor
def __init__(
    self,
    command='knp',
    server=None,
    port=31000,
    timeout=60,
    option='-tab',
    rcfile='',
    pattern=r'EOS',
    jumancommand='jumanpp',
    jumanrcfile='',
    jumanoption='',
    jumanpp=True,
    multithreading=False,
):
    self.command = command
    self.server = server
    self.port = port
    self.timeout = timeout
    self.options = option.split()
    self.rcfile = rcfile
    self.pattern = pattern

    if server is not None:
        self.analyzer = Analyzer(backend='socket', timeout=timeout,
                                 server=server, port=port,
                                 socket_option='RUN -tab -normal\n')
    else:
        cmds = [self.command] + self.options
        if self.rcfile:
            cmds += ['-r', self.rcfile]
        self.analyzer = Analyzer(backend='subprocess',
                                 multithreading=multithreading,
                                 timeout=timeout, command=cmds)

    self.jumanpp = jumanpp
    if self.rcfile and not os.path.isfile(os.path.expanduser(self.rcfile)):
        raise Exception("Can't read rcfile (%s)!" % self.rcfile)
    if distutils.spawn.find_executable(self.command) is None:
        raise Exception("Can't find KNP command: %s" % self.command)

    self.juman = Juman(command=jumancommand, rcfile=jumanrcfile,
                       option=jumanoption, jumanpp=self.jumanpp,
                       multithreading=multithreading)
def word_distance(s1, s2):
    juman = Juman()
    # If more than half of the characters are ASCII alphanumerics, fall back to
    # the English variant.
    r = len(s1 + s2) - len(re.sub("[a-zA-Z0-9]", "", s1 + s2))
    if r > len((s1 + s2).replace(" ", "")) // 2:
        return word_distance_en(s1, s2)
    # Keep only content words: nouns, verbs, adjectives, and demonstratives.
    sss = [
        set(item.midasi for item in juman.analysis(ss).mrph_list()
            if item.hinsi in {'名詞', '動詞', '形容詞', '指示詞'}
            or '内容語' in item.imis)
        for ss in [s1, s2]
    ]
    if min(len(sss[0]), len(sss[1])) == 0:
        return 0
    # Overlap coefficient between the two content-word sets.
    return float(len(sss[0] & sss[1])) / min(len(sss[0]), len(sss[1]))
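# Hedged usage sketch (illustrative, not from the original source): the return
# value is an overlap coefficient over content-word sets, so identical
# sentences score 1.0 and sentences with no shared content words score 0.0.
# word_distance("猫が好きだ", "猫が好きだ")  # -> 1.0
# word_distance("猫が好きだ", "犬を飼う")    # -> 0.0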
def title_clean(title_ls):
    tmp_ls = copy.deepcopy([title_ls])
    # Drop None entries (deleting while indexing would skip elements).
    tmp_ls = [t for t in tmp_ls if t is not None]
    for i in range(len(tmp_ls)):
        tmp_ls[i] = normalize('NFKC', tmp_ls[i])
        tmp_ls[i] = tmp_ls[i].replace(' ', '')
        tmp_ls[i] = re.sub(r'−.+?$', '', tmp_ls[i])
        tmp_ls[i] = re.sub(r'ーY.+?$', '', tmp_ls[i])
        tmp_ls[i] = re.sub(r'\|.+?$', '', tmp_ls[i])
    jumanpp = Juman()
    sep_ls = []
    for tmp in tmp_ls:
        sep_ls.append(' '.join(mrph.midasi for mrph in jumanpp.analysis(tmp)))
    return sep_ls[0]