import ginza


def analysis_bunsetu(doc):
    """Take GiNZA's dependency-parse result and return a list of Chunk
    objects, one per bunsetu (phrase unit)."""
    chunks = []
    # Head token index of each bunsetu (token index from morphological analysis)
    bunsetu_head_list = ginza.bunsetu_head_list(doc)
    # Split the document into bunsetu
    for i, bunsetu in enumerate(ginza.bunsetu_spans(doc)):
        # Instantiate a Chunk representing this bunsetu
        chunk = Chunk()
        # Morphological analysis of the bunsetu
        morphs = get_morphs(bunsetu)
        # Store the morphemes on the chunk and collect it
        chunk.morphs = morphs
        chunks.append(chunk)
        # Dependency analysis: Span.lefts yields tokens to the left of the
        # span whose heads lie inside it, i.e. bunsetu that depend on this one
        for token in bunsetu.lefts:
            # Index of the bunsetu the dependent token belongs to
            chunk_idx = get_bunsetu_head_list_index(bunsetu_head_list, token.i)
            chunks[chunk_idx].dst = i
            chunks[i].srcs.append(chunk_idx)
    return chunks
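# The snippet above assumes three project-local helpers. A minimal sketch of
# plausible definitions, inferred from the call sites (names, defaults, and
# bodies here are assumptions, not the original implementations):


class Chunk:
    """One bunsetu: its morphemes, dependency destination, and sources."""

    def __init__(self):
        self.morphs = []  # morphemes that make up this bunsetu
        self.dst = -1     # index of the bunsetu this one depends on
        self.srcs = []    # indices of the bunsetu that depend on this one


def get_morphs(bunsetu_span):
    # Simplest reading: the tokens of the bunsetu span
    return list(bunsetu_span)


def get_bunsetu_head_list_index(bunsetu_head_list, token_i):
    # Cross-bunsetu dependents are bunsetu heads in GiNZA, so the token's
    # bunsetu is the one whose head index equals token_i
    return bunsetu_head_list.index(token_i)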
import ginza
import pandas as pd
import spacy
import streamlit as st
import streamlit.components.v1 as stc

# Toc, create_manual, DICT_POS_JP, and DICT_DEP_JP are app-local helpers
# defined elsewhere in this project.


def main():
    st.set_page_config(layout="wide", initial_sidebar_state="expanded")
    st.title("GiNZA NLP Library")
    toc = Toc()
    toc.placeholder(True)
    input_list = st.text_area(
        "入力文字列",
        '銀座でランチをご一緒しましょう。今度の日曜日はどうですか。\n吾輩は猫である。 名前はまだ無い。 '
    ).splitlines()
    ignore_lf = st.checkbox("改行を無視して1回で解析する。", False)
    if not st.button("実行"):
        st.stop()
    if ignore_lf:
        input_list = ["".join(input_list)]
    with st.spinner('Wait for it...'):
        nlp = spacy.load('ja_ginza')
        # time.sleep(1.0)
    for i, input_str in enumerate(input_list):
        doc = nlp(input_str)
        for j, sent in enumerate(doc.sents):
            toc.subheader(f"{i + 1}-{j + 1}. {sent}")
            svg2 = spacy.displacy.render(
                create_manual(sent), style="dep", manual=True,
                options={"compact": True, "offset_x": 200, "distance": 175})
            st.image(svg2, width=(len(sent) + 1) * 120)
            df = pd.DataFrame(
                index=[],
                columns=["i(index)", "orth(テキスト)", "lemma(基本形)",
                         "reading_form(読みカナ)", "pos(PartOfSpeech)",
                         "pos(品詞)", "tag(品詞詳細)", "inflection(活用情報)",
                         "ent_type(エンティティ型)", "ent_iob(エンティティIOB)",
                         "lang(言語)", "dep(dependency)", "dep(構文従属関係)",
                         "head.i(親index)", "bunsetu_bi_label",
                         "bunsetu_position_type", "is_bunsetu_head",
                         "ent_label_ontonotes", "ent_label_ene"])
            for token in sent:
                row = pd.DataFrame(
                    [token.i, token.orth_, token.lemma_,
                     ginza.reading_form(token), token.pos_,
                     DICT_POS_JP.get(token.pos_, token.pos_), token.tag_,
                     ginza.inflection(token) or "-",
                     token.ent_type_ or "-", token.ent_iob_ or "-",
                     token.lang_, token.dep_,
                     DICT_DEP_JP.get(token.dep_, token.dep_), token.head.i,
                     ginza.bunsetu_bi_label(token),
                     ginza.bunsetu_position_type(token),
                     ginza.is_bunsetu_head(token),
                     ginza.ent_label_ontonotes(token) or "-",
                     ginza.ent_label_ene(token) or "-",
                     ], index=df.columns).T
                # DataFrame.append was removed in pandas 2.0; use concat
                df = pd.concat([df, row], ignore_index=True)
            st.table(df.T)
            st.subheader("文節区切り")
            bunsetu_list = ginza.bunsetu_spans(sent)
            st.text("/".join([bunsetu.orth_ for bunsetu in bunsetu_list]))
            st.subheader("文節の主辞区間と句の区分")
            st.text("/".join([f"{phrase}({phrase.label_})"
                              for phrase in ginza.bunsetu_phrase_spans(sent)]))
            st.subheader("固有表現(エンティティ)")
            if sent.ents:
                svg_ent = spacy.displacy.render(sent, style="ent")
                stc.html(svg_ent)
            else:
                st.text("No Entity")
    toc.generate()
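# `create_manual` above is an app-local helper. One plausible sketch, building
# displaCy's documented "manual" render format (a dict of "words" and "arcs",
# see https://spacy.io/usage/visualizers#manual-usage) from a spaCy Span; this
# body is an assumption, not the app's actual code:
def create_manual(sent):
    words = [{"text": token.orth_, "tag": token.pos_} for token in sent]
    arcs = []
    offset = sent.start  # arc indices must be sentence-relative
    for token in sent:
        if token.head.i == token.i:
            continue  # skip the root, which has no incoming arc
        start, end = sorted((token.i - offset, token.head.i - offset))
        arcs.append({
            "start": start,
            "end": end,
            "label": token.dep_,
            # displaCy draws the arrowhead at the dependent side
            "dir": "left" if token.i < token.head.i else "right",
        })
    return {"words": words, "arcs": arcs}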
# Fragment: the first lines run inside loops over candidate sentences `s`
# and their morphemes `mrph` (pyknp/JUMAN++ style, hence `mrph.midasi`);
# `tf`, `corpus_count`, and `candidates` come from the elided context.
        # Inverse document frequency; unseen words fall back to a count of 10
        idf = 1 / corpus_count.get(mrph.midasi, 10)
        tfidf += tf * idf
    candidates.append([s, tfidf])

# Sort ascending by tf-idf, keep the best `summary_count` sentences,
# then add the sentence most similar to the article title
candidates = sorted(candidates, key=lambda x: x[1])
cand_sentences = [c[0] for c in candidates][-summary_count:]
cand_sentences.append(
    title_similar_sentence(test_data["title"], test_data["body"]))

summary_list = []
for s in cand_sentences[-summary_count:]:
    doc = nlp(s)
    summary = ""
    for sent in doc.sents:
        for t in bunsetu_spans(sent):
            # Iterate the tokens of the bunsetu containing the span head
            for b in bunsetu(t.root, join_func=lambda tokens: tokens):
                if b.dep_ in ["nsubj", "obj", "ROOT", "acl",
                              "nmod", "compound", "nummod"]:
                    summary += b.lemma_
    summary_list.append(summary)

for i, s in enumerate(summary_list):
    if i < summary_count - 1:
        print(str(i) + ". " + s)
    else:
        # "The sentence that best matches the title"
        print("タイトルに最も一致する一文 : " + s)
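# A self-contained sketch of the content-word extraction used above, assuming
# ja_ginza is installed; the input sentence is an arbitrary example. It keeps
# tokens whose dependency label marks core content and joins their lemmas:
import spacy
from ginza import bunsetu, bunsetu_spans

nlp = spacy.load("ja_ginza")
doc = nlp("人工知能は計算機科学の一分野を指す語である。")
keep = {"nsubj", "obj", "ROOT", "acl", "nmod", "compound", "nummod"}
for sent in doc.sents:
    lemmas = []
    for span in bunsetu_spans(sent):
        # join_func=lambda ts: ts returns the bunsetu's tokens unmodified
        for t in bunsetu(span.root, join_func=lambda ts: ts):
            if t.dep_ in keep:
                lemmas.append(t.lemma_)
    print("".join(lemmas))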
import ginza
import spacy

# Chunk, get_morphs, and get_bunsetu_head_list_index are the same helpers
# used by analysis_bunsetu above
nlp = spacy.load('ja_ginza')

with open('./ai.ja.txt') as f:
    for line in f:
        line = line.strip()
        if line == "":
            continue
        chunks = []
        # Parse the line
        doc = nlp(line)
        # Head token index of each bunsetu (token index from morphological analysis)
        bunsetu_head_list = ginza.bunsetu_head_list(doc)
        # Split the sentence into bunsetu
        for i, bunsetu in enumerate(ginza.bunsetu_spans(doc)):
            # Instantiate a Chunk representing this bunsetu
            chunk = Chunk()
            # Morphological analysis of the bunsetu
            morphs = get_morphs(bunsetu)
            # Store the morphemes on the chunk and collect it
            chunk.morphs = morphs
            chunks.append(chunk)
            # Dependency analysis
            for token in bunsetu.lefts:
                # Index of the bunsetu the dependent token belongs to
                chunk_idx = get_bunsetu_head_list_index(
                    bunsetu_head_list, token.i)
                chunks[chunk_idx].dst = i
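# A hedged usage sketch: once the loop above has filled `chunks`, print each
# bunsetu together with the bunsetu it depends on (assumes the Chunk sketch
# earlier, where morphs are spaCy tokens and dst defaults to -1):
for i, chunk in enumerate(chunks):
    if chunk.dst < 0:
        continue  # the sentence root depends on nothing
    src = "".join(t.orth_ for t in chunk.morphs)
    dst = "".join(t.orth_ for t in chunks[chunk.dst].morphs)
    print(f"{src}\t{dst}")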
def _ginza_bunsetu(self, sentence):
    return [(chunk.text, chunk.label_)
            for chunk in ginza.bunsetu_spans(self.nlp(sentence))]
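# Hedged context for the method above: a minimal enclosing class (the class
# name and __init__ are assumptions; only the method comes from the source):
import ginza
import spacy


class BunsetuAnalyzer:
    def __init__(self):
        self.nlp = spacy.load("ja_ginza")

    def _ginza_bunsetu(self, sentence):
        return [(chunk.text, chunk.label_)
                for chunk in ginza.bunsetu_spans(self.nlp(sentence))]


# Each tuple pairs a bunsetu's text with its span label
print(BunsetuAnalyzer()._ginza_bunsetu("銀座でランチをご一緒しましょう。"))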
# When I first ran this I hit several errors and had to update pip and
# install python3-devel via yum (CentOS 7).
# Most of the code is from the sample at:
# https://www.megagon.ai/jp/blog/ginza-version-4-0/
import spacy
import ginza

nlp = spacy.load("ja_ginza")
doc = nlp("東京オリンピックは2021年に開催されています。")

print(ginza.bunsetu_spans(doc))
print("===============")
for np in doc.noun_chunks:
    print(np)
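# The cited GiNZA v4 blog also introduces bunsetu_phrase_spans, which trims
# each bunsetu down to its head phrase and attaches a phrase label; a small
# follow-on sketch reusing the same doc:
for phrase in ginza.bunsetu_phrase_spans(doc):
    print(phrase, phrase.label_)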