class MorphAnalysis:

    def __init__(self):
        self.stop_path = str(pathlib.Path(__file__).resolve().parent) + '/data/stopwords_slothlib.txt'
        self.stopwords = []
        with open(self.stop_path, 'r') as f:
            self.stopwords = f.read().split()
        # morphological analyzer (Juman++)
        self.jumanpp = Jumanpp()

    def to_wakati(self, text,
                  allow_word_class=['名詞', '指示詞', '動詞', '形容詞', '判定詞',
                                    '助動詞', '副詞', '助詞', '接続詞', '連体詞',
                                    '感動詞', '接頭辞', '特殊', '未定義語'],
                  remove_stopwords=False, genkei=False):
        wkt = ""
        text = mojimoji.han_to_zen(text)
        rst = self.jumanpp.analysis(text)
        for mrph in rst.mrph_list():
            # available attributes: midasi, yomi, genkei, hinsi, bunrui,
            # katuyou1, katuyou2, imis, repname
            if remove_stopwords and (mrph.genkei in self.stopwords):
                continue
            if mrph.hinsi in allow_word_class:
                if genkei:
                    wkt += mrph.genkei + ' '
                else:
                    wkt += mrph.midasi + ' '
        return wkt
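A minimal usage sketch, assuming Juman++, mojimoji, and the SlothLib stopword file are available at the expected paths (actual segmentation depends on the installed dictionary):

ma = MorphAnalysis()
print(ma.to_wakati('ケーキを食べた'))               # surface forms, e.g. 'ケーキ を 食べた '
print(ma.to_wakati('ケーキを食べた', genkei=True))  # base forms, e.g. 'ケーキ を 食べる '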
def get_keitaiso_list_from_juman(text):
    """
    Morphologically analyze text and return the result.
    Juman resolves orthographic-variation problems that MeCab cannot.
    """
    jumanpp = Jumanpp()
    keitaiso_list = []
    hinshi_list = []
    exclusive_word_list = get_exclusive_word_list()
    # spaces cause an error, and a leading '#' hangs the parser (reason unknown)
    text = text.replace(" ", "").replace("　", "").replace("#", "/")
    # pyknp's Juman only accepts unicode strings (Python 2)
    result = jumanpp.analysis(unicode(text, 'utf-8'))
    try:
        for mrph in result.mrph_list():
            keitaiso = mrph.genkei.encode('utf-8')
            hinshi = mrph.hinsi.encode('utf-8')
            # skip morphemes whose POS is not allowed, junk words, and digits
            if not is_valid_word_class(hinshi) or keitaiso in exclusive_word_list or keitaiso.isdigit():
                continue
            keitaiso_list.append(keitaiso)
            hinshi_list.append(hinshi)
    except Exception:
        traceback.print_exc()
    return [keitaiso_list, hinshi_list]
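get_exclusive_word_list and is_valid_word_class are project-local helpers not shown on this page; a minimal sketch of plausible (hypothetical) implementations:

def get_exclusive_word_list():
    # hypothetical junk-word list; the real list is project-defined
    return ['(', ')', 'RT']

def is_valid_word_class(hinshi):
    # hypothetical content-word whitelist; the real POS set is project-defined
    return hinshi in ('名詞', '動詞', '形容詞')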
def append_repname(words):
    """
    :param words: a list of Word instances
    :return: a list of Word instances whose aliases include the representative expressions
    """
    n_word = len(words)
    juman = Jumanpp()
    bar = progressbar.ProgressBar()
    for i in bar(range(n_word), max_value=n_word):
        word = words[i]
        if word.uid != i:
            continue  # already merged
        repname_set = []
        r = juman.analysis(word.p_surface)
        for mrph in r.mrph_list():
            if mrph.bunrui == '数詞':
                repname_set.append([kansuji2arabic(mrph.midasi)])
            elif mrph.repnames() != '':
                repname_set.append(mrph.repnames().split('?'))
            else:
                repname_set.append([mrph.midasi])
        words[i].alias.extend(expand_ambiguity(repname_set))
    return words
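kansuji2arabic and expand_ambiguity are project helpers not shown here; a plausible sketch of the latter, assuming it enumerates every combination of one representative per morpheme slot:

from itertools import product

def expand_ambiguity(repname_set):
    # hypothetical reconstruction: repname_set holds one candidate list per
    # morpheme; emit every concatenation choosing one candidate per slot
    return [''.join(combo) for combo in product(*repname_set)]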
def main():
    if len(sys.argv) != 2:
        print('need one argument for a file.')
        return
    file_name = sys.argv[1]
    vocab_dict = defaultdict(int)
    juman = Jumanpp()
    with open(file_name, 'r', encoding='utf-8', newline='') as fr:
        text = fr.readlines()
    for line in text:
        # Juman++ does not support half-width characters
        line = line.replace(' ', '　')
        line = line.translate(
            str.maketrans(
                {chr(0x0021 + i): chr(0xFF01 + i) for i in range(94)}))
        analysis = juman.analysis(line.replace('\n', ''))
        for m in analysis.mrph_list():
            vocab_dict[str(m.midasi)] += 1
    sorted_dict = sorted(vocab_dict.items(), key=lambda x: x[1], reverse=True)
    print(sorted_dict)
    print(len(sorted_dict))
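The maketrans table above shifts the 94 printable ASCII characters (U+0021–U+007E) onto their full-width counterparts (U+FF01–U+FF5E); the half-width space is outside that range, which is why it is replaced separately. A standalone check:

# half-width ASCII -> full-width; note U+0020 (space) is not in the table
han2zen = str.maketrans({chr(0x0021 + i): chr(0xFF01 + i) for i in range(94)})
print('abc123!'.translate(han2zen))  # -> 'ａｂｃ１２３！'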
def read_and_anlyze_text():
    sys.stdin = codecs.getreader('utf_8')(sys.stdin)
    sys.stdout = codecs.getwriter('utf_8')(sys.stdout)
    jumanpp = Jumanpp()
    midasis = []
    repnames = []
    repname_counts = {}
    wikipedia_redirections = []
    w_rs = []
    w_r_counts = {}
    row_result = []
    while True:
        input_ = sys.stdin.readline()
        if input_ == '':
            break
        input_ = input_.strip()
        if input_ == '':
            continue
        result = jumanpp.analysis(input_)
        for mrph in result.mrph_list():
            if mrph.repname not in repname_counts:
                repname_counts[mrph.repname] = 0
            if (mrph.midasi not in midasis) and (mrph.repname != u""):
                repname_counts[mrph.repname] += 1
            w_r = get_wikipedia_redirection(mrph.imis)
            if not w_r:
                w_r = mrph.midasi
            if w_r not in w_r_counts:
                w_r_counts[w_r] = 0
            if mrph.midasi not in midasis:
                w_r_counts[w_r] += 1
            midasis.append(mrph.midasi)
            repnames.append(mrph.repname)
            wikipedia_redirections.append(w_r)
            w_rs.append(w_r)
        midasis.append("\n")
        repnames.append("\n")
        wikipedia_redirections.append(None)
        w_rs.append("\n")
        repname_counts["\n"] = 0
        w_r_counts["\n"] = 0
        row_result.append(result.spec())
    yure_result = []
    for i, midasi in enumerate(midasis):
        yure = repname_counts[repnames[i]] > 1 or w_r_counts[w_rs[i]] > 1
        yure_result.append({
            "midasi": midasi,
            "repname": repnames[i],
            "wikipedia_redirection": wikipedia_redirections[i],
            "repname_count": repname_counts[repnames[i]],
            "w_r_count": w_r_counts[w_rs[i]],
            "yure": yure
        })
    return row_result, yure_result
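A toy illustration (not from the source) of the variation check above: distinct surface forms sharing one representative form ("repname") are flagged as orthographic variants (表記ゆれ).

from collections import Counter

tokens = [('引っ越し', '引っ越す/ひっこす'), ('引越', '引っ越す/ひっこす'), ('犬', '犬/いぬ')]
repname_counts = Counter(rep for _, rep in tokens)
for surface, rep in tokens:
    print(surface, 'yure' if repname_counts[rep] > 1 else 'ok')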
def analysis_text(self, text, debug=None):
    jumanpp = Jumanpp()
    # Juman++ occasionally fails with an unexplained error, so guard the call
    try:
        result = jumanpp.analysis(text)
    except Exception:
        return None
    if debug:
        self.__print_analyzed(result)
    return result
def segment(texts):
    jumanpp = Jumanpp()
    results = {}
    for text in texts:
        try:
            parsed = jumanpp.analysis(han_to_zen(neologdn.normalize(text)))
            segmented = ' '.join(m.midasi for m in parsed.mrph_list())
            results[text] = segmented
        except Exception:
            logger.warning('Cannot parse {}'.format(text))
            continue
    return results
def parser_juman(text):
    from pyknp import Jumanpp
    jumanpp = Jumanpp()
    result = jumanpp.analysis(text)
    words = []
    for n in result.mrph_list():
        # skip particles, auxiliary verbs, symbols, and whitespace
        if n.hinsi not in ('助詞', '助動詞', '特殊') and n.bunrui != '空白':
            if n.hinsi == '動詞':
                words.append(n.genkei)  # base form for verbs
            else:
                words.append(n.midasi)
    return words
def segment_ja(texts, flag_keep_number=False):
    jumanpp = Jumanpp()
    results = {}
    for text in texts:
        try:
            parsed = jumanpp.analysis(han_to_zen(text))
            if flag_keep_number:
                segmented = ' '.join(m.midasi for m in parsed.mrph_list())
            else:
                # mask numerals with a placeholder token
                segmented = ' '.join('<数詞>' if m.bunrui == '数詞' else m.midasi
                                     for m in parsed.mrph_list())
            results[text] = segmented
        except Exception:
            logger.warning('Cannot parse {}'.format(text))
            continue
    return results
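A usage sketch, assuming Juman++, mojimoji's han_to_zen, and a module-level logger are set up; with the default flag_keep_number=False every numeral collapses to the <数詞> placeholder:

print(segment_ja(['りんごを3個買った']))
# e.g. {'りんごを3個買った': 'りんご を <数詞> 個 買った'}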
def main():
    model_w2v = gensim.models.KeyedVectors.load_word2vec_format(
        "/share/data/word2vec/2016.08.02/w2v.midasi.256.100K.bin",
        binary=True, unicode_errors='ignore')
    word2index = {w: i for i, w in enumerate(model_w2v.index2word)}
    model = BiLSTM(embed_mat=model_w2v.vectors, mid_size=128)
    serializers.load_npz("BiLSTM_attention.model", model)
    # accept sentences on stdin for interactive testing
    jumanpp = Jumanpp()
    while True:
        input_sentence = sys.stdin.readline()  # str, includes the trailing newline
        result = jumanpp.analysis(input_sentence)
        doc = [mrph.midasi for mrph in result.mrph_list()]
        x = [doc2list(doc, word2index)]
        # x = list2Var([doc2vec(doc)], np.float32, False)
        with chainer.using_config("train", False):
            y, attn_list = model.predict(x)
        p = np.argmax(y[0].data)
        doc_class = ["新聞記事", " 雑誌 ", " 教科書 ", " ブログ "]
        print("")
        print("*------------------------*")
        print("|                        |")
        print("|        " + doc_class[p] + "        |")
        print("|                        |")
        print("*------------------------*")
        print("")
        prob = F.softmax(y, axis=1)[0].data
        print("新聞記事: {:.6f} 雑誌: {:.6f} 教科書: {:.6f} ブログ: {:.6f}".format(
            prob[0], prob[1], prob[2], prob[3]))
        for word, attn in sorted(zip(doc, attn_list),
                                 key=lambda x: x[1], reverse=True):
            print(word, end=", ")
        print("\n")
def parse(line):
    if line is None:
        return
    if line == "\n":
        return
    jumanpp = Jumanpp()
    replaced = re.sub('\n|\u3000| ', '', line)
    result = jumanpp.analysis(replaced)
    words = []
    for mrph in result.mrph_list():
        if mrph is not None:
            print('{0} reading: {1} POS: {2} conj.1: {3} conj.2: {4}'.format(
                mrph.midasi, mrph.yomi, mrph.hinsi, mrph.katuyou1, mrph.katuyou2))
            words.append(mrph.midasi)
    return words
class JumanParser(Parser):

    def __init__(self):
        super().__init__()
        remove_pattern = r'・|、|\,|\.| |　'
        self.remove_compiled = re.compile(remove_pattern)
        self.analyzer = Jumanpp()

    def parse(self, message):
        for sent in message.sentences:
            sent.text = self.remove_compiled.sub('', sent.text)
            parsed = self.analyzer.analysis(sent.text)
            mrph_list = parsed.mrph_list()
            sent.bag = self.create_bags(mrph_list)
            message.bags += sent.bag
        return message

    @staticmethod
    def create_bags(mrph_list):
        bag = []
        for mrph in mrph_list:
            if mrph.hinsi in ('名詞', '動詞'):
                bag.append(mrph.genkei)
        return bag
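create_bags only needs a morpheme list, so it can be exercised without the surrounding Parser plumbing; a sketch, assuming Juman++ is installed:

from pyknp import Jumanpp

mrphs = Jumanpp().analysis('庭で犬が走る').mrph_list()
print(JumanParser.create_bags(mrphs))  # base forms of nouns and verbs, e.g. ['庭', '犬', '走る']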
class IntentSlotDatasetReader(DatasetReader):

    def __init__(self, lazy=False, max_tokens=64):
        super().__init__(lazy)
        self.token_indexers = {'tokens': SingleIdTokenIndexer()}
        self.max_tokens = max_tokens
        self.jumanpp = Jumanpp()

    def _read(self, file_path):
        with open(file_path, 'r') as f:
            for line in f:
                line = line.strip().split()
                label = line[-1]
                line = [tt.split(':') for tt in line[:-2]]
                text = [Token(tt[0]) for tt in line][0:self.max_tokens]
                tags = [tt[1] for tt in line][0:self.max_tokens]
                yield self.text_to_instance(text, label, tags)

    def tokenizer(self, text):
        text = [Token(mrph.midasi)
                for mrph in self.jumanpp.analysis(text).mrph_list()][0:self.max_tokens]
        return text

    def text_to_instance(self, text, label=None, tags=None):
        text_field = TextField(text, self.token_indexers)
        fields = {'text': text_field}
        if label:
            fields['label'] = LabelField(label, label_namespace='labels')
        if tags:
            fields['tag'] = SequenceLabelField(tags, text_field, label_namespace='tags')
        return Instance(fields)
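A sketch of the Juman++-backed tokenizer, assuming AllenNLP and pyknp are installed:

reader = IntentSlotDatasetReader(max_tokens=8)
print([t.text for t in reader.tokenizer('ケーキを食べる')])  # e.g. ['ケーキ', 'を', '食べる']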
# -*- coding: utf-8 -*-
from pyknp import Jumanpp
import sys
import codecs

j = Jumanpp()
line = sys.stdin.readline()
if sys.version_info[0] == 2:
    result = j.analysis(line.decode("utf-8"))
else:
    result = j.analysis(line)
for mrph in result.mrph_list():
    print(mrph.midasi)
def main():
    Topic = []
    Utterance = []
    Relevance = []
    ID = []
    regex = u'[^ぁ-ん]+'
    # training data: form [label, Topic & Utterance]
    # wf_Data = open("Tpc&UTRtEST.csv", "w")
    all_filepaths = glob.glob('./testGS/*')
    for filepath in all_filepaths:
        lines = [line.rstrip() for line in fileinput.input(
            filepath, openhook=fileinput.hook_encoded('utf-8'))]
        # validate the JSON as a whole
        try:
            arguments = json.loads('\n'.join(lines))
        except json.JSONDecodeError as e:
            print('invalid JSON')
            print(e)
            exit(1)
        # Display title
        # print(arguments[0]["Topic"])
        for argument in arguments:
            ID.append(argument["ID"])
            Topic.append(argument["Topic"])
            Utterance.append(argument["Utterance"])
            Relevance.append(argument["Relevance"])
    TrueDataset = {}
    correctAnswer_0 = 0
    correctAnswer_1 = 0
    incorrectAnswer_0 = 0
    incorrectAnswer_1 = 0
    for line in list(set(Utterance)):
        T_List = []
        R_list = []
        id_tag = 0
        for line_l in range(len(Utterance)):
            if line == Utterance[line_l]:
                T_List.append(Topic[line_l])
                R_list.append(Relevance[line_l])
                id_tag = ID[line_l]
        TrueDataset[Counter(T_List).most_common()[0][0] + ":" + line + ":" + id_tag] = \
            str(Counter(R_list).most_common()[0][0])
    sorted(TrueDataset.items())
    # Analyze Utterance using Juman++ & KNP
    jumanpp = Jumanpp()
    with open("CommonWords.csv", "w") as wf:
        wf.write("label,A,B\n")
        line_cnt = len(TrueDataset)
        now_line_cnt = 0
        for key, label in TrueDataset.items():
            tpc, utr, id = key.split(":")[0], key.split(":")[1], key.split(":")[2]
            topANDutrANDlabelList = []
            # parse Topic
            topic_analyed_List = []
            topANDutrANDlabelList.append("Topic")
            try:
                # 0.7909880035111675
                # s = tpc.split("を")[-2] + "を" + tpc.split("を")[-1].split("べきである")[0]
                # topic_result = jumanpp.analysis(s)
                topic_result = jumanpp.analysis(format_text(tpc))
                # print(s)
                for mrph in topic_result.mrph_list():
                    try:
                        if len(re.findall(regex, mrph.genkei)) > 0:
                            if "名詞" in mrph.hinsi or "動詞" in mrph.hinsi:
                                if "数量" in mrph.imis:
                                    topic_analyed_List.append(mrph.genkei)
                                    topANDutrANDlabelList.append("[数]")
                                else:
                                    topic_analyed_List.append(mrph.genkei)
                                    topANDutrANDlabelList.append(mrph.genkei)
                    except Exception:
                        continue
            except Exception:
                continue
            # parse Utterance
            utter_analyed_List = []
            topANDutrANDlabelList.append("Utterance")
            try:
                if "、" in utr:
                    utrList = utr.split("、")
                    for sentence in utrList:
                        # guard against empty segments
                        if sentence == "":
                            continue
                        utter_result = jumanpp.analysis(sentence)
                        for mrph in utter_result.mrph_list():
                            try:
                                if len(re.findall(regex, mrph.genkei)) > 0:
                                    if "名詞" in mrph.hinsi or "動詞" in mrph.hinsi:
                                        if "数量" in mrph.imis:
                                            utter_analyed_List.append(mrph.genkei)
                                            topANDutrANDlabelList.append("[数]")
                                        else:
                                            utter_analyed_List.append(mrph.genkei)
                                            topANDutrANDlabelList.append(mrph.genkei)
                                    else:
                                        continue
                            except Exception:
                                print("error")
                                continue
                else:
                    utter_result = jumanpp.analysis(utr)
                    for mrph in utter_result.mrph_list():
                        try:
                            if len(re.findall(regex, mrph.genkei)) > 0:
                                if "名詞" in mrph.hinsi or "動詞" in mrph.hinsi:
                                    if "数量" in mrph.imis:
                                        utter_analyed_List.append(mrph.genkei)
                                        topANDutrANDlabelList.append("[数]")
                                    else:
                                        utter_analyed_List.append(mrph.genkei)
                                        topANDutrANDlabelList.append(mrph.genkei)
                        except Exception:
                            print("error")
                            continue
                topANDutrANDlabelList.append("END")
            except Exception:
                print("error")
                continue
            # if "END" in topANDutrANDlabelList:
            #     wf_Data.write(str(label) + "," + " ".join(topANDutrANDlabelList[:-1]) + "\n")
            # if "END" in topANDutrANDlabelList:
            #     wf_Data.write(str(label) + "," + " ".join(topANDutrANDlabelList[:-1]) + " [---] " +
            #                   "{\"ID\":\"" + id + "\",\"Topic\":\"" + tpc + "\",\"Utterance\":\"" + utr +
            #                   "\",\"Relevance\":\"" + "null" + "\",\"Fact-checkability\":null,\"Stance\":null,\"Class\":null}\n")
            if len(set(topic_analyed_List) & set(utter_analyed_List)) > 0:
                # wf.write("{\"ID\":\"" + id + "\",\"Topic\":\"" + tpc + "\",\"Utterance\":\"" + utr + "\",\"Relevance\":\"" + "1" + "\",\"Fact-checkability\":null,\"Stance\":null,\"Class\":null},\n")
                wf.write(str(label) + ",1," + str(1) + "\n")
            else:
                wf.write(str(label) + ",1," + str(0) + "\n")
                # wf.write("{\"ID\":\"" + id + "\",\"Topic\":\"" + tpc + "\",\"Utterance\":\"" + utr + "\",\"Relevance\":\"" + "0" + "\",\"Fact-checkability\":null,\"Stance\":null,\"Class\":null},\n")
            # if len(set(topic_analyed_List) & set(utter_analyed_List)) > 0:
            #     if int(label) == 1:
            #         correctAnswer_1 += 1
            #     else:
            #         incorrectAnswer_1 += 1
            # else:
            #     if int(label) == 0:
            #         correctAnswer_0 += 1
            #     else:
            #         incorrectAnswer_0 += 1
            now_line_cnt += 1
            print(now_line_cnt, len(TrueDataset), line_cnt)
    correctAnswer = correctAnswer_0 + correctAnswer_1
    print(correctAnswer * 1.0 / now_line_cnt, " ans0:", correctAnswer_0,
          " ans1:", correctAnswer_1, " miss:", now_line_cnt - correctAnswer)
    print("details:", "p0t0", correctAnswer_0, "p0t1", incorrectAnswer_0,
          "p1t0", incorrectAnswer_1, "p1t1", correctAnswer_1)
    label_cnt = 0
    for text, label in TrueDataset.items():
        if int(label) == 1:
            label_cnt += 1
    print(label_cnt / len(TrueDataset))
print("Processing Text:{}".format(i)) if s == "": continue result = jumanpp.analysis(s) midasi_lst = [] for w in result.mrph_list(): midasi_lst.append([w.midasi.replace("_"," "),"O"]) t_midasi.append(midasi_lst) """ print("-----------------") for i, s in enumerate(wlist): print("Processing Word:{}".format(i)) if s == "": continue result = jumanpp.analysis(s) midasi_lst = [] for w in result.mrph_list(): midasi = w.midasi.replace("_", "") if midasi == "": continue midasi_lst.append(midasi) w_midasi.append(midasi_lst) """ with open("./text_midasi.list","wb") as f: pickle.dump(t_midasi,f) """ with open("./word_midasi.list", "wb") as f: pickle.dump(w_midasi, f)
#-*- encoding: utf-8 -*-
from pyknp import Jumanpp
import sys
import codecs

# sys.stdin = codecs.getreader('utf_8')(sys.stdin)
# sys.stdout = codecs.getwriter('utf_8')(sys.stdout)

# Use Juman++ in subprocess mode
jumanpp = Jumanpp()
result = jumanpp.analysis(u"ケーキを食べる")
for mrph in result.mrph_list():
    print("見出し:{0}".format(mrph.midasi))
def main(): print("fsovs") Topic = [] Utterance = [] Relevance = [] regex = u'[^ぁ-ん]+' #学習用データ form[label, Topic & Utterce] wf_Data = open("Tpc&UTR_Stance.csv","w") all_filepaths=glob.glob('./training/*') for filepath in all_filepaths: lines = [line.rstrip() for line in fileinput.input( filepath, openhook=fileinput.hook_encoded('utf-8'))] # JSON全体の文法チェック try: arguments = json.loads('\n'.join(lines)) except json.JSONDecodeError as e: print('エラーあり') print(e) exit(1) # Display title #print(arguments[0]["Topic"]) for argument in arguments: Topic.append(argument["Topic"]) Utterance.append(argument["Utterance"]) Relevance.append(argument["Stance"]) TrueDataset = {} correctAnswer_0 = 0 correctAnswer_1 = 0 for line in list(set(Utterance)): T_List = [] R_list = [] for line_l in range(len(Utterance)): if line == Utterance[line_l]: T_List.append(Topic[line_l]) R_list.append(Relevance[line_l]) TrueDataset[Counter(T_List).most_common()[0][0] + ":" + line] = str(Counter(R_list).most_common()[0][0]) sorted(TrueDataset.items()) # Analyze Utterance using Juman++ & knp jumanpp = Jumanpp() with open("incorrectTrus.txt","w") as wf: line_cnt = len(TrueDataset) now_line_cnt = 0 for key, label in TrueDataset.items(): tpc,utr = key.split(":")[0],key.split(":")[1] topANDutrANDlabelList = [] #parse Topic topic_analyed_List = [] topANDutrANDlabelList.append("Topic") try: #0.7909880035111675 #s = tpc.split("を")[-2] + "を" + tpc.split("を")[-1].split("べきである")[0] #topic_result = jumanpp.analysis(s) topic_result = jumanpp.analysis(format_text(tpc)) #print(s) for mrph in topic_result.mrph_list(): try : if len(re.findall(regex, mrph.genkei)) > 0: if ("名詞" in mrph.hinsi or "動詞" in mrph.hinsi): if "数量" in mrph.imis: topic_analyed_List.append(mrph.genkei) topANDutrANDlabelList.append("[数]") else: topic_analyed_List.append(mrph.genkei) topANDutrANDlabelList.append(mrph.genkei) except: continue except: continue #parse Utterance utter_analyed_List = [] topANDutrANDlabelList.append("Utterance") try: if "、" in utr: utrList = utr.split("、") for sentence in utrList: #reigi if sentence == "": continue utter_result = jumanpp.analysis(sentence) for mrph in utter_result.mrph_list(): try : if len(re.findall(regex, mrph.genkei)) > 0: if ("名詞" in mrph.hinsi or "動詞" in mrph.hinsi): if "数量" in mrph.imis: utter_analyed_List.append(mrph.genkei) topANDutrANDlabelList.append("[数]") else: utter_analyed_List.append(mrph.genkei) topANDutrANDlabelList.append(mrph.genkei) else: continue except: print("error") continue else: utter_result = jumanpp.analysis(utr) for mrph in utter_result.mrph_list(): try : if len(re.findall(regex, mrph.genkei)) > 0: if ("名詞" in mrph.hinsi or "動詞" in mrph.hinsi): if "数量" in mrph.imis: utter_analyed_List.append(mrph.genkei) topANDutrANDlabelList.append("[数]") else: utter_analyed_List.append(mrph.genkei) topANDutrANDlabelList.append(mrph.genkei) except: print("error") continue topANDutrANDlabelList.append("END") except: print("error") continue if "END" in topANDutrANDlabelList: #print(topANDutrANDlabelList) wf_Data.write(str(label) + "," + " ".join(topANDutrANDlabelList[:-1])+"\n") #print((set(topic_analyed_List) & set(utter_analyed_List)),len(set(topic_analyed_List) & set(utter_analyed_List))) #if (len(set(topic_analyed_List) & set(utter_analyed_List)) > 0): #print("1:",label) if int(label) == 1: wf.write(tpc + ":" + utr + "[" + "1" + ":" +label + "]\n") elif int(label) == 2: wf.write(tpc + ":" + utr + "[" + "2" + ":" +label + "]\n") else: wf.write(tpc + ":" + utr + "[" + "0" + ":" +label + "]\n")
# (fragment: tail of a CSV-reading loop; row, f, jumanpp, outF, and the
#  counters are defined earlier in the original script)
try:
    a = str(row[1])
    text_list.append(a)
except Exception:
    print('read_fail')
    load_miss_count += 1
f.close()

num = 0
for line in text_list:
    text = ''
    num += 1
    try:
        result = jumanpp.analysis(line)
        print(num)
        for mrph in result.mrph_list():
            hinsi = mrph.hinsi
            # NOTE: the four POS names compared here were lost in the source;
            # the original filtered on specific hinsi values
            if hinsi == "" or hinsi == "" or hinsi == "" or hinsi == "":
                text += str(mrph.midasi) + ' '
        outF.write(str(text) + '\n')
    except (AttributeError, TypeError, KeyError, ValueError):
        print("missing_value")
        analyze_miss_count += 1
        continue
outF.close()
print(load_miss_count)
print(analyze_miss_count)
def main():
    all_filepaths = glob.glob('./training/*')
    # print("frhifr", all_filepaths)
    Topic = []
    Utterance = []
    Relevance = []
    FactCheck = []
    Stance = []
    for filepath in all_filepaths:
        # args = get_args()
        # load the JSON
        # src = '-' if not hasattr(args, 'json_file') else args.json_file
        lines = [line.rstrip() for line in fileinput.input(
            filepath, openhook=fileinput.hook_encoded('utf-8'))]
        # validate the JSON as a whole
        try:
            arguments = json.loads('\n'.join(lines))
        except json.JSONDecodeError as e:
            print('invalid JSON')
            print(e)
            exit(1)
        # Display title
        # print(arguments[0]["Topic"])
        for argument in arguments:
            Topic.append(argument["Topic"])
            Utterance.append(argument["Utterance"])
            Relevance.append(argument["Relevance"])
            FactCheck.append(argument["Fact-checkability"])
            Stance.append(argument["Stance"])
    TrueDataset = []
    for line in list(set(Utterance)):
        cnt = 0
        R_list = []
        F_list = []
        S_list = []
        for line_l in range(len(Utterance)):
            if line == Utterance[line_l]:
                cnt += 1
                R_list.append(Relevance[line_l])
                F_list.append(FactCheck[line_l])
                S_list.append(Stance[line_l])
        plane = line + " " + str(Counter(R_list).most_common()[0][0]) + " " + \
            str(Counter(F_list).most_common()[0][0]) + " " + \
            str(Counter(S_list).most_common()[0][0])
        if not ((cnt == 5 and Counter(S_list).most_common()[0][1] == 2) or
                (cnt == 3 and Counter(S_list).most_common()[0][1] == 1)):
            TrueDataset.append(plane)
    # Analyze Utterance using Juman++
    jumanpp = Jumanpp()
    for arguments in TrueDataset:
        # print(argument["Utterance"], argument["Relevance"], argument["Fact-checkability"], argument["Stance"], argument["Class"])
        argument = arguments.split(" ")
        result = jumanpp.analysis(argument[0])
        analyed_argument = ""
        for mrph in result.mrph_list():
            if "名詞" in mrph.hinsi or "動詞" in mrph.hinsi:
                analyed_argument += mrph.midasi + " "
        analyed_argument += "\t"
        analyed_argument += argument[1] + "\t"
        analyed_argument += argument[2] + "\t"
        analyed_argument += argument[3]
        print(analyed_argument)
#-*- encoding: utf-8 -*-
from pyknp import Jumanpp
import sys
import codecs

jumanpp = Jumanpp()
f = codecs.open(sys.argv[1], 'r', 'utf-8')
for line in f:
    text = line.rstrip()
    try:
        result = jumanpp.analysis(text)
        tokens = [mrph.midasi for mrph in result.mrph_list()]
        print('\t'.join(tokens))
    except ValueError:
        print('VALUE ERROR')
f.close()

# # Use Juman++ in subprocess mode
# jumanpp = Jumanpp()
# result = jumanpp.analysis(u"ケーキを食べる")
# for mrph in result.mrph_list():
#     print(u"見出し:{0}".format(mrph.midasi))
help="classify text", type=str, default="日本でのビジネス") parser.add_argument("--path_to_model", help="model to use", type=str, default="./models/my-model.ckpt") args = parser.parse_args() jumanpp = Jumanpp() classify_data = [] vocab = Vocabulary("data_use.txt") result = jumanpp.analysis(args.input_text) for mrph in result.mrph_list(): word = mrph.midasi classify_data.append(vocab.stoi(word)) classify_data = data_helper.pad_one(classify_data, 256, 0) with open("training_config.json") as f: params = json.load(f) embedding_mat = np.load("./models/embedding.npy") session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_config) with sess.as_default():
def main():
    Topic = []
    Utterance = []
    Relevance = []
    regex = u'[^ぁ-ん]+'
    all_filepaths = glob.glob('./testGS/*')
    for filepath in all_filepaths:
        lines = [line.rstrip() for line in fileinput.input(
            filepath, openhook=fileinput.hook_encoded('utf-8'))]
        # validate the JSON as a whole
        try:
            arguments = json.loads('\n'.join(lines))
        except json.JSONDecodeError as e:
            print('invalid JSON')
            print(e)
            exit(1)
        # Display title
        # print(arguments[0]["Topic"])
        for argument in arguments:
            Topic.append(argument["Topic"])
            Utterance.append(argument["Utterance"])
            Relevance.append(argument["Relevance"])
    TrueDataset = {}
    correctAnswer = 0
    for line in list(set(Utterance)):
        T_List = []
        R_list = []
        for line_l in range(len(Utterance)):
            if line == Utterance[line_l]:
                T_List.append(Topic[line_l])
                R_list.append(Relevance[line_l])
        TrueDataset[Counter(T_List).most_common()[0][0] + ":" + line] = \
            str(Counter(R_list).most_common()[0][0])
    trueCnt = 0
    for key, label in TrueDataset.items():
        if int(label) == 1:
            trueCnt += 1
    print("AllTrue:", trueCnt / len(TrueDataset))
    # Analyze Utterance using Juman++ & KNP
    jumanpp = Jumanpp()
    with open("incorrect_test.txt", "w") as wf:
        line_cnt = len(TrueDataset)
        now_line_cnt = 0
        for key, label in TrueDataset.items():
            tpc, utr = key.split(":")[0], key.split(":")[1]
            # print(tpc + ":" + utr + "[" + label + "]")
            # parse Topic
            topic_analyed_List = []
            try:
                # 0.7909880035111675
                # s = tpc.split("を")[-2] + "を" + tpc.split("を")[-1].split("べきである")[0]
                # topic_result = jumanpp.analysis(s)
                topic_result = jumanpp.analysis(format_text(tpc))
                # print(s)
                for mrph in topic_result.mrph_list():
                    try:
                        if len(re.findall(regex, mrph.midasi)) > 0:
                            if "名詞" in mrph.hinsi or "動詞" in mrph.hinsi:
                                topic_analyed_List.append(mrph.midasi)
                    except Exception:
                        continue
            except Exception:
                # print("Error.", tpc)
                continue
            # parse Utterance
            utter_analyed_List = []
            try:
                utter_result = jumanpp.analysis(utr)
                for mrph in utter_result.mrph_list():
                    try:
                        if len(re.findall(regex, mrph.midasi)) > 0:
                            if "名詞" in mrph.hinsi or "動詞" in mrph.hinsi:
                                utter_analyed_List.append(mrph.midasi)
                    except Exception:
                        continue
            except Exception:
                # print("Error.", utr)
                continue
            # print((set(topic_analyed_List) & set(utter_analyed_List)), len(set(topic_analyed_List) & set(utter_analyed_List)))
            if len(set(topic_analyed_List) & set(utter_analyed_List)) > 0:
                # print("1:", label)
                if int(label) == 1:
                    correctAnswer += 1
                else:
                    wf.write(tpc + ":" + utr + "[" + "1" + ":" + label + "]\n")
            else:
                # print("0:", label)
                if int(label) == 0:
                    correctAnswer += 1
                else:
                    wf.write(tpc + ":" + utr + "[" + "0" + ":" + label + "]\n")
            now_line_cnt += 1
            # print(now_line_cnt, "/", line_cnt)
    print("accuracy:", correctAnswer * 1.0 / len(TrueDataset))
# build the training/development data
with open('data/training_data_sample.json', 'r') as f:
    corpus = json.load(f)
for key in corpus:
    with open('data/%s' % key, 'w') as f:
        for data in corpus[key]:
            text = data['text'].translate(han2zen)
            # morphological analysis
            mrphs = [mrph.midasi for mrph in jumanpp.analysis(text).mrph_list()]
            # build a mapping from character offsets to word indices
            c2w = {}
            c = 0
            w = 0
            for mrph in mrphs:
                for i in range(len(mrph)):
                    c2w[c] = w
                    c += 1
                w += 1
            # build the slot sequence
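The c2w dictionary maps every character offset of the analyzed text to the index of the morpheme containing it; a self-contained check with a pre-segmented example:

# toy check of the character-offset -> word-index mapping above
mrphs = ['ケーキ', 'を', '食べる']
c2w, c = {}, 0
for w, mrph in enumerate(mrphs):
    for _ in mrph:
        c2w[c] = w
        c += 1
print(c2w)  # {0: 0, 1: 0, 2: 0, 3: 1, 4: 2, 5: 2, 6: 2}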