def __init__(self, command='knp', option='-tab', rcfile='', server=None, port=31000, timeout=30,
             pattern=r'(?:^|\n)EOS($|\n)', jumanrcfile='', juman_option='-e2 -B', juman_port=32000,
             juman_command='juman', jumanpp=False):
    """Set up the morphological analyzer and the KNP query function.

    Args:
        command: KNP executable, used only when ``server`` is None.
        option: KNP options; must contain ``-tab``.
        rcfile: Optional KNP rc file. If given it must exist; it is appended
            to the options as ``-r <rcfile>`` in subprocess mode.
        server: Server host. When not None, KNP is queried over a socket;
            the same host is also handed to the Juman(++) wrapper.
        port: KNP server port; must differ from ``juman_port``.
        timeout: Socket timeout passed to ``Socket``.
        pattern: Regex marking the end of one KNP result; must contain ``EOS``.
        jumanrcfile: rc file passed to the Juman(++) wrapper.
        juman_option: Options passed to the Juman(++) wrapper.
        juman_port: Port passed to the Juman(++) wrapper.
        juman_command: Only compared against ``"jumanpp"`` to pick the wrapper.
        jumanpp: Force the ``Jumanpp`` wrapper.

    Raises:
        AssertionError: If ``pattern`` lacks ``EOS``, ``option`` lacks
            ``-tab``, or ``port == juman_port``.
        SystemExit: With status 1 if ``rcfile`` is given but is not a file.
    """
    self.use_jumanpp = (juman_command == "jumanpp") or jumanpp

    assert 'EOS' in pattern
    self.pattern = pattern
    self.EOS = 'EOS'

    # Only tab-format output is parsed.
    assert '-tab' in option

    if rcfile and not os.path.isfile(os.path.expanduser(rcfile)):
        sys.stderr.write("Can't read rcfile (%s)!\n" % rcfile)
        # sys.exit instead of quit(): quit is injected by the `site` module
        # and is missing under `python -S` or in frozen applications.
        sys.exit(1)

    # Setup Juman(++)
    assert port != juman_port
    juman_args = {'option': juman_option, 'rcfile': jumanrcfile, 'server': server, 'port': juman_port}
    if self.use_jumanpp:
        self.juman = Jumanpp(**juman_args)
    else:
        self.juman = Juman(**juman_args)

    # Setup KNP
    if server is not None:
        self.socket = Socket(server, port, option=option, timeout=timeout)
        self.query = partial(self.socket.query, pattern=pattern)
    else:
        if rcfile:
            option += " -r {}".format(rcfile)
        self.subprocess = Subprocess(command, option=option)
        self.query = partial(self.subprocess.query, pattern=pattern)
def analyzer():
    """Search the index for documents similar to the ``q`` request parameter.

    The query text is segmented with Juman, the space-joined segmentation is
    embedded through the BERT client, and Elasticsearch ranks documents whose
    ``source`` matches ``tb`` by cosine similarity against ``question_vector``.

    Returns:
        The Elasticsearch response wrapped by ``jsonify``.
    """
    bc = BertClient(ip='bertserving', output_fmt='list')
    client = Elasticsearch('elasticsearch:9200')
    jumanpp = Juman()

    query = request.args.get('q')
    morphemes = jumanpp.analysis(query).mrph_list()
    segmented = " ".join([mrph.midasi for mrph in morphemes])
    query_vector = bc.encode([segmented], is_tokenized=False)[0]

    similarity_script = {
        "source": "cosineSimilarity(params.query_vector, doc['question_vector']) + 1.0",
        "params": {"query_vector": query_vector},
    }
    script_query = {
        "script_score": {
            "query": {"match": {"source": "tb"}},
            "script": similarity_script,
        }
    }
    search_body = {"size": SEARCH_SIZE, "query": script_query}
    response = client.search(index=INDEX_NAME, body=search_body)
    return jsonify(response)
def __init__(self):
    """Build the tokenizer around a default-configured pyknp ``Juman``."""
    # Imported lazily so pyknp is only required when a tokenizer is created.
    import pyknp

    self.do_lower_case = False
    self._jumanpp = pyknp.Juman()
def __init__(self, command='knp', server=None, port=31000, timeout=60, option='-tab', rcfile='', pattern=r'EOS',
             jumancommand='juman', jumanrcfile='', jumanpp=False):
    """Store the KNP configuration and create the morphological analyzer.

    Args:
        command: KNP command.
        server: KNP server host, or None.
        port: KNP server port.
        timeout: Timeout value stored for later use.
        option: KNP option string.
        rcfile: Optional KNP rc file; must exist if given.
        pattern: Terminator pattern of KNP output.
        jumancommand: Juman command; the value ``"jumanpp"`` selects ``Jumanpp``.
        jumanrcfile: rc file passed to ``Juman`` (not used for ``Jumanpp``).
        jumanpp: Force the ``Jumanpp`` wrapper.

    Raises:
        SystemExit: With status 1 if ``rcfile`` is given but is not a file.
    """
    self.command = command
    self.server = server
    self.port = port
    self.timeout = timeout
    self.option = option
    self.rcfile = rcfile
    self.pattern = pattern
    # Connections are not opened here; both start out unset.
    self.socket = None
    self.subprocess = None
    self.jumanpp = (jumancommand == "jumanpp") or jumanpp

    if self.rcfile and not os.path.isfile(os.path.expanduser(self.rcfile)):
        sys.stderr.write("Can't read rcfile (%s)!\n" % self.rcfile)
        # sys.exit instead of quit(): quit is injected by the `site` module
        # and is missing under `python -S` or in frozen applications.
        sys.exit(1)

    if self.jumanpp:
        self.juman = Jumanpp()
    else:
        self.juman = Juman(command=jumancommand, rcfile=jumanrcfile)
def _apply_jumanpp(self, inp: str) -> Tuple[str, str]:
    """Analyze ``inp`` and return it in raw and CoNLL form.

    Returns:
        A pair ``(raw, conll)``: the analyzer's ``spec()`` output and its
        conversion by ``_jumanpp2conll_one_sentence``, each followed by an
        ``EOS`` line.
    """
    analyzer = Juman(command=self.juman, option=self.juman_option)
    raw = analyzer.analysis(inp).spec() + 'EOS\n'
    conll = self._jumanpp2conll_one_sentence(raw) + 'EOS\n'
    return raw, conll
def __init__(self, command='knp', server=None, port=31000, timeout=60, option='-tab', rcfile='', pattern=r'EOS',
             jumancommand='jumanpp', jumanrcfile='', jumanoption='', jumanpp=True):
    """Record the KNP settings, validate them and build the Juman wrapper.

    Raises:
        Exception: If ``rcfile`` is given but is not a file, or if the KNP
            command cannot be found on the search path.
    """
    self.command, self.server, self.port = command, server, port
    self.timeout = timeout
    self.options = option.split()
    self.rcfile, self.pattern = rcfile, pattern
    # No connection is opened at construction time.
    self.socket = None
    self.subprocess = None
    self.jumanpp = jumanpp

    if self.rcfile:
        rc_path = os.path.expanduser(self.rcfile)
        if not os.path.isfile(rc_path):
            raise Exception("Can't read rcfile (%s)!" % self.rcfile)

    knp_path = distutils.spawn.find_executable(self.command)
    if knp_path is None:
        raise Exception("Can't find KNP command: %s" % self.command)

    juman_kwargs = dict(command=jumancommand, rcfile=jumanrcfile, option=jumanoption, jumanpp=self.jumanpp)
    self.juman = Juman(**juman_kwargs)
def main(DATA_ROOT):
    """Write sentence/word-boundary pickles for every ``*.txt`` under DATA_ROOT.

    Each text file is stripped of ``=...=`` heading markers, empty lines and
    the ``<block>`` / ``<math-element>`` tags, split into sentences on ``。``
    and newlines, and stripped of whitespace. Every sentence is segmented
    with Juman; for each one a 0/1 list marks the last character of every
    morpheme. The result ``(sentences, offsets)`` is pickled next to the
    source file as ``<name>.pickle``.

    Sentences that Juman rejects with ``ValueError`` are printed and left out
    of the output, so the two lists always stay aligned.

    Args:
        DATA_ROOT: Directory searched recursively for ``.txt`` files.

    Raises:
        IndexError: If the morpheme lengths overrun the sentence length.
    """
    juman = Juman()
    for text_file in Path(DATA_ROOT).glob('**/*.txt'):
        with open(text_file) as f:
            content = f.read()

        # Raw replacement string: "\g" is not a valid escape in a normal literal.
        content = re.sub(r"=+(.*?)=+", r"\g<1>", content)
        content = re.sub(r"^\n", "", content, flags=re.MULTILINE)
        content = content.replace('<block>', '')
        content = content.replace('<math-element>', '')

        # In this case, 。 can be removed safely
        sentences = re.split(r"[。\n]", content)
        sentences = [''.join(line.split()) for line in sentences if len(line) != 0]

        # Keep only the sentences that were parsed successfully.
        val_sentences = []
        offsets = []
        for sentence in tqdm(sentences):
            try:
                result = juman.analysis(sentence)
            except ValueError:
                # Skip it: without a fresh result the code below would reuse
                # the previous sentence's morphemes (or fail with NameError).
                print(sentence)
                continue

            morphemes = result.mrph_list()
            offset = [0] * len(sentence)
            current = 0
            for mrph in morphemes:
                current += len(mrph.midasi)
                try:
                    offset[current - 1] = 1
                except IndexError:
                    # Dump the context that produced the overrun, then fail.
                    print(sentence)
                    print(current)
                    for _mrph in morphemes:
                        print(_mrph.midasi)
                    raise

            val_sentences.append(sentence)
            offsets.append(offset)

        # Pair offsets with the sentences they were computed from.
        results = (val_sentences, offsets)
        out_path = Path(text_file.parent, text_file.name[:-4] + '.pickle')
        with open(out_path, 'wb') as f:
            pickle.dump(results, f)
def run(self):
    """Segment every loaded row with Juman and dump the result as a DataFrame.

    Each output row holds the space-separated segmentation of the full-width
    ``sentence`` under ``context``, the segmentation of the auxiliary
    sentence under ``qa`` (built from target/aspect/sentiment when
    ``task_name`` is ``'QA_B'``, a single space otherwise), and ``label`` 1.
    """
    data = self.load()
    jumanpp = Juman()

    def segment(text):
        # Space-joined surface forms of the analyzed text.
        return " ".join([mrph.midasi for mrph in jumanpp.analysis(text).mrph_list()])

    rows = []
    for _, row in data.iterrows():
        context = segment(jaconv.h2z(row["sentence"], ascii=True, digit=True))
        if self.task_name == 'QA_B':
            question = jaconv.h2z(
                f"{row['target']}の{row['aspect']}は{row['sentiment']}",
                ascii=True,
                digit=True,
            )
        else:
            question = " "
        rows.append({"context": context, "qa": segment(question), "label": 1})

    self.dump(pd.DataFrame(rows))
def initialize(fword, tword, modelfn, start, debug):
    """Normalize both words, load the word2vec model and build lookup tables.

    Args:
        fword: Source word; must analyze to a single morpheme with a repname.
        tword: Target word; same requirement.
        modelfn: Model path ending in ``.model`` (native format) or ``.bin``
            (word2vec binary format).
        start: Clock reading taken by the caller, used for timing output.
        debug: When true, intermediate timings are printed via ``printtime``.

    Returns:
        ``(fword, tword, model, V, id2vocab, t3)``: the two representative
        forms, the model, the stacked vector matrix, the index-to-vocabulary
        map, and the last timing value.

    Raises:
        SystemExit: If a word is split into several morphemes or has no repname.
        ValueError: If ``modelfn`` has an unsupported extension.
        KeyError: If either representative form is missing from the model.
    """
    juman = Juman()
    fword = _single_repname(juman, fword)
    tword = _single_repname(juman, tword)

    # load and check model
    print(u'loading model...')
    extension = modelfn.split('.')[-1]
    if extension == 'model':
        model = Word2Vec.load(modelfn)
    elif extension == 'bin':
        model = Word2Vec.load_word2vec_format(modelfn, binary=True, unicode_errors='ignore')
    else:
        # Previously fell through and failed later on an unbound `model`.
        raise ValueError(u'unsupported model file extension: {}'.format(modelfn))

    if fword not in model.vocab:
        raise KeyError(u'{} is not found in the model'.format(fword))
    elif tword not in model.vocab:
        raise KeyError(u'{} is not found in the model'.format(tword))
    model.save('hs0.100m.500.5.18mgt100.model')

    # NOTE(review): time.clock() was removed in Python 3.8, and t2/t3 subtract
    # an elapsed duration (not a clock reading) from the clock. Left unchanged
    # because callers receive t3 -- confirm the intended semantics.
    t1 = time.clock() - start
    if debug:
        printtime(t1)

    print(u'constructing id2vocab map...')
    id2vocab = dict(enumerate(model.vocab))
    t2 = time.clock() - t1
    if debug:
        printtime(t2)

    print(u'constructing V...')
    V = np.vstack([model[v] for v in model.vocab])
    t3 = time.clock() - t2
    if debug:
        printtime(t3)

    return fword, tword, model, V, id2vocab, t3


def _single_repname(juman, word):
    """Return the repname of *word*, exiting unless it is one morpheme with a repname."""
    import sys

    morphemes = juman.analysis(word).mrph_list()
    if len(morphemes) > 1:
        print(u'{} is parsed multiple words'.format(word))
        sys.exit(1)
    morpheme = morphemes[0]
    if not morpheme.repname:
        print(u'no repname with {}'.format(word))
        sys.exit(1)
    return morpheme.repname
def string_word_point(self, df):
    """Build per-time and per-word tally DataFrames from a comment DataFrame.

    For every row of ``df`` (columns ``comment`` and ``time`` are read) the
    comment is checked for a URL, cleaned with ``my_delete`` (written back
    into ``df``), tallied by time, checked for "www" and applause markers,
    and analyzed with Juman so that tokens classified as '名詞' (noun) are
    tallied and recorded together with their time.

    NOTE(review): ``my_index`` / ``make_df_append`` are defined elsewhere;
    presumably they look up a key and add-or-increment its row -- confirm.

    Returns:
        Tuple ``(df_time_word, df_word_point, df_time_point,
        df_time_www_point, df_time_hakusyu_point, df_URL_point)``.
    """
    jumanpp = Juman(jumanpp=False)
    tmp_word =[]
    df_time_word = pd.DataFrame(index=[], columns=['time','word']) # words and the time they appeared
    df_word_point = pd.DataFrame(index=[], columns=['word','point']) # words and their occurrence counts
    df_time_point = pd.DataFrame(index=[], columns=['time','point']) # times and the comment count at that time
    df_time_www_point = pd.DataFrame(index=[], columns=['time','point']) # times and the "www" count at that time
    df_time_hakusyu_point = pd.DataFrame(index=[], columns=['time','point']) # times and the applause count at that time
    df_URL_point = pd.DataFrame(index=[], columns=['URL','point']) # URL summary
    #print(df_word_point)
    for i in range(len(df)):
        # Tally the comment if it is a URL
        url=URL_hanbetu(df['comment'][i])
        if url != False:
            tmp = self.my_index(df_URL_point['URL'],url)
            df_URL_point = self.make_df_append(df_URL_point,tmp,url)
        #print("before symbol removal")
        #print(df_word_point)
        # Remove symbols (the cleaned text is written back into df)
        print(df['comment'][i])
        df['comment'][i] = self.my_delete(df['comment'][i])
        # Convert h:m:s -> hms
        tmp_time = self.strtime_to_inttime(df['time'][i])
        # Tally comments per time
        tmp = self.my_index(df_time_point['time'],tmp_time)
        df_time_point = self.make_df_append(df_time_point,tmp,tmp_time)
        # Add 1 when the comment contains "www", otherwise add a 0 row
        print(url)
        if False != self.www_hanbetu(df['comment'][i]) and url == False:
            df_time_www_point = self.make_df_append(df_time_www_point,tmp,tmp_time)
        else:
            # NOTE(review): `False == tmp` is also true when tmp is 0 -- confirm my_index never returns index 0
            if False == tmp :
                df_time_www_point = df_time_www_point.append({'time': tmp_time, 'point': 0}, ignore_index=True)
        # Add 1 when the comment contains applause, otherwise add a 0 row
        if False != self.hakusyu_hanbetu(df['comment'][i]):
            df_time_hakusyu_point = self.make_df_append(df_time_hakusyu_point,tmp,tmp_time)
        else:
            if False == tmp :
                df_time_hakusyu_point = df_time_hakusyu_point.append({'time': tmp_time, 'point': 0}, ignore_index=True)
        # Morphological analysis
        result = jumanpp.analysis(df['comment'][i])
        #print(result)
        # Build the DataFrames from the analysis result
        for token in result.mrph_list():
            tmp_word = token.midasi
            # Tally noun occurrences
            if 0 != self.word_Classification(token.hinsi):
                # Only when the token is a noun
                if self.word_Classification(token.hinsi) == '名詞':
                    tmp = self.my_index(df_word_point['word'],tmp_word)
                    df_word_point = self.make_df_append(df_word_point,tmp,tmp_word)
                    # Record the noun together with its time
                    df_time_word = df_time_word.append({'time':tmp_time,'word': tmp_word}, ignore_index=True)
    return df_time_word,df_word_point,df_time_point,df_time_www_point, df_time_hakusyu_point,df_URL_point
def __init__(self, bert_model: str, fine_tuned_model: str, jumanpp_command: str):
    """Load the segmenter, the BERT tokenizer and the fine-tuned classifier.

    Args:
        bert_model: Name or path given to both the tokenizer and the classifier.
        fine_tuned_model: Path of the saved state dict, loaded onto the CPU.
        jumanpp_command: Command handed to ``Juman``.
    """
    self.jumanpp = Juman(command=jumanpp_command)
    self.tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=False)
    self.model = BertPosNegClassifier(bert_model)

    checkpoint = torch.load(fine_tuned_model, map_location=torch.device('cpu'))
    # Remove every 'module.' substring from the parameter names before loading.
    renamed = {}
    for name, tensor in checkpoint.items():
        renamed[name.replace('module.', '')] = tensor
    self.model.load_state_dict(renamed)
    self.model.eval()
def __init__(
    self,
    preprocessor=None,
    stopwords=None,
):
    """Create the Juman analyzer and store the optional helpers.

    Args:
        preprocessor: Optional preprocessing object, stored as given.
        stopwords: Optional list of stop words. Defaults to a fresh empty
            list per instance, so instances never share one default list.
    """
    self.jumanpp = Juman()
    self.preprocessor = preprocessor
    self.stopwords = [] if stopwords is None else stopwords
def counter(text, d):
    """Add the base-form frequencies of ``text`` to the mapping ``d`` in place."""
    analysis = Juman().analysis(text)
    for mrph in analysis.mrph_list():
        base = mrph.genkei
        d[base] = d.get(base, 0) + 1
def morphological_analysis(self, text):
    """Clean ``text``, segment it with Juman and return the processed tokens.

    Every surface form is passed through ``self.modification`` and the items
    it returns are collected into one flat list.
    """
    jumanpp = Juman()
    cleaned = self.remove_special_character(text)
    tokens = []
    # The analysis splits the text into individual words.
    for mrph in jumanpp.analysis(cleaned).mrph_list():
        tokens.extend(self.modification(mrph.midasi))
    return tokens
def juman_list(text):
    """Return the comma-joined base forms of ``text``.

    Morphemes whose sub-category is "アルファベット" (alphabet) are replaced
    by the literal string "En".
    """
    analyzer = Juman()
    tokens = []
    for mrph in analyzer.analysis(text).mrph_list():
        if mrph.bunrui == "アルファベット":
            tokens.append("En")
        else:
            tokens.append(mrph.genkei)
    return ",".join(tokens)
def juman_test():
    """Smoke-test Juman: print its attributes and the analysis of a sample text.

    Python 2 code (relies on the ``unicode`` builtin).
    """
    juman = Juman()
    print(dir(juman))

    text = "テストテキスト"
    squeezed = "".join(text.split())
    utext = unicode(squeezed)
    print(u'"' + utext + u'"')

    for mrph in juman.analysis(utext).mrph_list():
        # surface form : reading : base form
        print('> ' + mrph.midasi + ' : ' + mrph.yomi + ' : ' + mrph.genkei)
def test_juman_wrapper(self):
    """Analyze a sample sentence and check every item is a ``pyknp.Morpheme``."""
    juman = Juman()
    result = juman.analysis(u"これはペンです。")
    surfaces = [mrph.midasi for mrph in result]
    print(','.join(surfaces))

    template = u"見出し:%s, 読み:%s, 原形:%s, 品詞:%s, 品詞細分類:%s, 活用型:%s, 活用形:%s, 意味情報:%s, 代表表記:%s"
    for mrph in result.mrph_list():
        assert isinstance(mrph, pyknp.Morpheme)
        fields = (mrph.midasi, mrph.yomi, mrph.genkei, mrph.hinsi, mrph.bunrui,
                  mrph.katuyou1, mrph.katuyou2, mrph.imis, mrph.repname)
        print(template % fields)
def __init__(self):
    """Build the Japanese-only tokenizer, which uses JUMAN++ through pyknp."""
    # Imported lazily so pyknp is only required when a tokenizer is created.
    import pyknp

    self.do_lower_case = False
    self._jumanpp = pyknp.Juman()
def get_repname_using_jumanpp(genkei: str, pos: str) -> str:
    """Return the representative form of ``genkei``.

    Particles ('助詞') and inputs that are not analyzed as exactly one
    morpheme fall back to ``"<genkei>/<genkei>"``.
    """
    fallback = f'{genkei}/{genkei}'
    if pos == '助詞':
        return fallback

    mrphs = Juman(option='-s 1').analysis(genkei, juman_format=JUMAN_FORMAT.LATTICE_TOP_ONE)
    # A single morpheme means the analysis did not split the word wrongly.
    if len(mrphs) != 1:
        return fallback
    return mrphs[0].repname
def __init__(self):
    '''Create the Juman and KNP analyzers used by this object.

    KNP is configured with the ``-tab -anaphora`` options.

    Examples
    --------
    >>> nlp = JNLP()
    '''
    self.juman = Juman()
    self.KNP = KNP(option='-tab -anaphora')
def test_juman_wrapper(self):
    """Analyze a sample sentence with the configured Juman command.

    Every morpheme must be a ``pyknp.Morpheme``; the test is skipped with a
    message when an ``ImportError`` is raised.
    """
    template = "見出し:%s, 読み:%s, 原形:%s, 品詞:%s, 品詞細分類:%s, 活用型:%s, 活用形:%s, 意味情報:%s, 代表表記:%s"
    try:
        juman = Juman(command=self.path_to_juman_command)
        result = juman.analysis("これはペンです。")
        surfaces = ','.join(mrph.midasi for mrph in result)
        logger.debug(surfaces)
        for mrph in result.mrph_list():
            assert isinstance(mrph, pyknp.Morpheme)
            fields = (mrph.midasi, mrph.yomi, mrph.genkei, mrph.hinsi, mrph.bunrui,
                      mrph.katuyou1, mrph.katuyou2, mrph.imis, mrph.repname)
            logger.debug(template % fields)
    except ImportError:
        print('skip test_juman_wrapper')
def bulk_predict(docs, batch_size=256):
    """Predict bert embeddings.

    Documents are processed in batches of ``batch_size``. Each document's
    ``question`` is cut into pieces of at most ``MAX_TXT_LENGTH`` characters,
    each piece is segmented with Juman and space-joined, and the batch is
    encoded with the module-level BERT client ``bc``.

    Args:
        docs: Sequence of mappings with a ``question`` string.
        batch_size: Number of documents encoded per request.

    Yields:
        One embedding per encoded text piece, in input order.
    """
    jumanpp = Juman(jumanpp=False)
    for i in range(0, len(docs), batch_size):
        batch_docs = docs[i: i + batch_size]
        pre_embedding_docs = []
        for doc in batch_docs:
            question = doc['question']
            # NOTE(review): a question longer than MAX_TXT_LENGTH yields several
            # pieces, hence several embeddings for one doc -- confirm callers
            # do not zip the output 1:1 with `docs`.
            for k in range(0, len(question), MAX_TXT_LENGTH):
                result = jumanpp.analysis(question[k:k + MAX_TXT_LENGTH])
                texts = [mrph.midasi for mrph in result.mrph_list()]
                pre_embedding_docs.append(" ".join(texts))
        # NOTE(review): is_tokenized=True is passed with space-joined strings,
        # not token lists -- verify against the BertClient.encode contract.
        embeddings = bc.encode(pre_embedding_docs, is_tokenized=True)
        # The original yielded an undefined name `emb`; emit each vector.
        for emb in embeddings:
            yield emb
def seg2word(seg, command="/mnt/gold/users/s18153/bin/jumanpp", len_split=1000):
    """Segment ``seg`` into space-separated words with Juman++.

    The input is cut into pieces of ``len_split`` characters, each piece is
    analyzed separately, and the surface forms are joined with single spaces.

    Args:
        seg: Text to segment.
        command: Juman++ executable. The default keeps the original
            hard-coded location; pass another path on other machines.
        len_split: Maximum number of characters analyzed per call.

    Returns:
        The space-separated segmentation (empty string for empty input).
    """
    pieces = [seg[i:i + len_split] for i in range(0, len(seg), len_split)]
    juman_def = Juman(command=command)
    return ' '.join([
        " ".join([mrph.midasi for mrph in juman_def.analysis(piece).mrph_list()])
        for piece in pieces
    ])
def juman_wakati(text, hinshi=(), DEBUG=False, STEM_FLAG=False):
    """Return the space-separated stems of the selected parts of speech.

    Args:
        text: Text to analyze with Juman.
        hinshi: Parts of speech whose stems are kept.
        DEBUG: When true, print details of every morpheme.
        STEM_FLAG: Stems are only collected when this is true.

    Returns:
        The stems (the part of ``repname`` before ``/``) joined by spaces.

    NOTE(review): with ``STEM_FLAG`` false nothing is collected and the
    result is always an empty string -- confirm that is intended.
    """
    juman = Juman()
    stems = []
    for mrph in juman.analysis(text).mrph_list():
        if STEM_FLAG and mrph.hinsi in hinshi:
            stems.append(mrph.repname.split("/")[0])
        if DEBUG:
            print("stem:", mrph.repname)
            # Show the surface form here; the original printed repname twice.
            print("midashi:", mrph.midasi)
            print("hinsi:", mrph.hinsi)
            print("yomi:", mrph.yomi)
    return " ".join(stems).strip()
def __init__(
        self, command='knp', server=None, port=31000, timeout=60, option='-tab', rcfile='', pattern=r'EOS',
        jumancommand='jumanpp', jumanrcfile='', jumanoption='', jumanpp=True, multithreading=False,
):
    """Record the KNP settings and build the KNP analyzer and Juman wrapper.

    A socket-backed ``Analyzer`` is used when ``server`` is given; otherwise
    a subprocess-backed one running ``command`` with the split options
    (plus ``-r rcfile`` when an rc file is set).

    Raises:
        Exception: If ``rcfile`` is given but is not a file, or if the KNP
            command cannot be found on the search path.
    """
    self.command, self.server, self.port = command, server, port
    self.timeout = timeout
    self.options = option.split()
    self.rcfile, self.pattern = rcfile, pattern

    if server is None:
        cmds = [self.command] + self.options
        if self.rcfile:
            cmds += ['-r', self.rcfile]
        self.analyzer = Analyzer(backend='subprocess', multithreading=multithreading, timeout=timeout,
                                 command=cmds)
    else:
        self.analyzer = Analyzer(backend='socket', timeout=timeout, server=server, port=port,
                                 socket_option='RUN -tab -normal\n')
    self.jumanpp = jumanpp

    if self.rcfile:
        rc_path = os.path.expanduser(self.rcfile)
        if not os.path.isfile(rc_path):
            raise Exception("Can't read rcfile (%s)!" % self.rcfile)
    knp_path = distutils.spawn.find_executable(self.command)
    if knp_path is None:
        raise Exception("Can't find KNP command: %s" % self.command)

    self.juman = Juman(command=jumancommand, rcfile=jumanrcfile, option=jumanoption,
                       jumanpp=self.jumanpp, multithreading=multithreading)
def read_home_timeline( session ):
    """Fetch the home timeline and print the Juman analysis of each tweet.

    For every tweet the whitespace-stripped text is printed, followed by one
    line per morpheme with its base form, part of speech and sub-category.
    A non-200 response only prints a failure message.

    Python 2 code (relies on the ``unicode`` builtin).
    """
    print( '[kazuha] - read timeline.' )
    juman = Juman()
    req = session.get( twitter.API_home_timeline, params = {} )
    if req.status_code != 200:
        print( '[kazuha] - read timeline: failure.' )
        return

    for tweet in json.loads( req.text ):
        squeezed = "".join(tweet["text"].split())
        u_tweet_text = unicode( squeezed )
        print( u'[kazuha] - read timeline: '+ u_tweet_text )
        for mrph in juman.analysis( u_tweet_text ).mrph_list():
            print( u"%s - (%s, %s)" % (mrph.genkei, mrph.hinsi, mrph.bunrui) )
class JumanTokenizer:
    """Tokenizer that splits text into Juman surface forms."""

    def __init__(self, command, options):
        """Create the analyzer.

        Args:
            command: Juman executable.
            options: Option string for the executable.
        """
        # Keyword arguments are required here: pyknp's second positional
        # parameter is `server`, so Juman(command, options) would treat the
        # option string as a server host.
        self.juman = Juman(command=command, option=options)

    def tokenize(self, text):
        """Return the list of surface forms of ``text``."""
        result = self.juman.analysis(text)
        return [mrph.midasi for mrph in result.mrph_list()]
def main(bert_vocab_filepath, example_filepath, context_filepath, cache_save_dir):
    """Encode the examples against their contexts and cache the tensors.

    Args:
        bert_vocab_filepath: Vocabulary location given to the BERT tokenizer.
        example_filepath: File read by ``load_examples``.
        context_filepath: File read by ``load_contexts``.
        cache_save_dir: Directory (created if missing) receiving
            ``input_ids.pt``, ``attention_mask.pt``, ``token_type_ids.pt``
            and ``labels.pt``.
    """
    # Juman++
    juman = Juman(jumanpp=True)

    logger.info("Cache files will be saved in {}.".format(cache_save_dir))

    # Tokenizer
    logger.info("Create a tokenizer from {}.".format(bert_vocab_filepath))
    tokenizer = BertTokenizer.from_pretrained(bert_vocab_filepath, do_lower_case=False)

    logger.info("Start loading examples from {}.".format(example_filepath))
    examples = load_examples(example_filepath)
    logger.info("Finished loading examples.")
    logger.info("Number of examples: {}".format(len(examples)))

    logger.info("Start loading contexts from {}.".format(context_filepath))
    contexts = load_contexts(context_filepath)
    logger.info("Finished loading contexts.")

    logger.info("Start encoding examples.")
    encoding = encode_examples(juman, tokenizer, examples, contexts, 512)
    logger.info("Finished encoding examples.")

    os.makedirs(cache_save_dir, exist_ok=True)
    # One cache file per tensor, named after its key.
    for key in ("input_ids", "attention_mask", "token_type_ids", "labels"):
        torch.save(encoding[key], os.path.join(cache_save_dir, key + ".pt"))

    logger.info("Saved cache files in {}.".format(cache_save_dir))
def title_clean(title_ls):
    """Normalize a single title string and return its space-separated words.

    The title is NFKC-normalized, stripped of spaces, truncated at the first
    ``−``, ``ーY`` or ``|`` marker (when at least one character follows it),
    and segmented with Juman.

    Args:
        title_ls: The title string. Despite the name it is one string: the
            original wrapped it in a one-element list, which made its
            None-removal loop dead code; that loop is removed here.

    Returns:
        The surface forms joined by single spaces.

    Raises:
        TypeError: If ``title_ls`` is not a string (e.g. None).
    """
    title = normalize('NFKC', title_ls)
    title = title.replace(' ', '')
    for marker_pattern in (r'−.+?$', r'ーY.+?$', r'\|.+?$'):
        title = re.sub(marker_pattern, '', title)

    jumanpp = Juman()
    return ' '.join([mrph.midasi for mrph in jumanpp.analysis(title)])
class JumanTokenizer():
    """Tokenizer returning the surface forms produced by Juman++."""

    def __init__(self):
        self.juman = Juman(jumanpp=True)

    def tokenize(self, text):
        """Return the list of surface forms of ``text``."""
        surfaces = []
        for mrph in self.juman.analysis(text).mrph_list():
            surfaces.append(mrph.midasi)
        return surfaces
class JumanService(object):
    """Thin service wrapper around a private Juman instance."""

    def __init__(self):
        self.__juman = Juman()

    def analysis(self, string):
        """Format ``string`` with ``JumanKnpUtil`` and return its Juman analysis."""
        return self.__juman.analysis(JumanKnpUtil.format_input_string(string))
class JumanTokenizer():
    """Juman tokenizer with a multiprocessing batch mode."""

    # Character translations applied during preprocessing ('"' is deleted).
    _trans_tables = str.maketrans({"\"": "", "@": "@", "#": "#"})

    def __init__(self, ):
        self.juman = Juman()

    def _preprocess(self, sentences):
        """Remove spaces and newlines, then apply the translation table."""
        return sentences.replace(" ", "").replace("\n", "").translate(self._trans_tables)

    def tokenize(self, text):
        """Return the list of surface forms of ``text``."""
        result = self.juman.analysis(text)
        return [mrph.midasi for mrph in result.mrph_list()]

    def _preprocess_list(self, datas):
        """Pair every input string with its preprocessed form."""
        return [[x, self._preprocess(x)] for x in datas]

    def _tokenize_for_multi(self, datas):
        """Tokenize one ``[original, preprocessed]`` pair.

        Returns:
            ``[original, tokens]``, or ``[]`` when tokenization fails
            (best effort: one bad input must not abort the whole batch).
        """
        try:
            return [datas[0], self.tokenize(datas[1])]
        except Exception:
            # Not a bare `except:` -- KeyboardInterrupt and SystemExit must
            # still be able to stop the worker.
            return []

    def tokenize_multi(self, datas, thread=cpu_count()):
        """Tokenize ``datas`` in a process pool.

        Args:
            datas: Iterable of strings.
            thread: Number of worker processes.

        Returns:
            List of ``[original, tokens]`` pairs (``[]`` for failed inputs),
            in completion order rather than input order.
        """
        datas = self._preprocess_list(datas)
        num_of_datas = len(datas)
        with Pool(thread) as pool:
            imap = pool.imap_unordered(self._tokenize_for_multi, datas)
            result = list(tqdm(imap, total=num_of_datas))
        return result
def word_distance(s1, s2):
    """Return the content-word overlap ratio of two sentences.

    When ASCII letters and digits make up more than half of the non-space
    characters, the work is delegated to ``word_distance_en``. Otherwise the
    surface forms of content words (nouns, verbs, adjectives, demonstratives,
    or morphemes tagged '内容語') are collected per sentence and the size of
    their intersection is divided by the size of the smaller set.

    Returns:
        0 when either sentence has no content words, else a float in [0, 1].
    """
    juman = Juman()

    combined = s1 + s2
    alnum_count = len(combined) - len(re.sub("[a-zA-Z0-9]", "", combined))
    if alnum_count > len(combined.replace(" ", "")) // 2:
        return word_distance_en(s1, s2)

    content_pos = {'名詞', '動詞', '形容詞', '指示詞'}

    def content_words(sentence):
        words = set()
        for item in juman.analysis(sentence).mrph_list():
            if item.hinsi in content_pos or '内容語' in item.imis:
                words.add(item.midasi)
        return words

    words1 = content_words(s1)
    words2 = content_words(s2)
    smaller = min(len(words1), len(words2))
    if smaller == 0:
        return 0
    return float(len(words1 & words2)) / smaller
def __init__(self, command='knp', server=None, port=31000, timeout=60, option='-tab', rcfile='', pattern=r'EOS',
             jumancommand='jumanpp', jumanrcfile='', jumanpp=True):
    """Record the KNP settings, validate them and build the Juman wrapper.

    Raises:
        Exception: If ``rcfile`` is given but is not a file, or if the KNP
            command cannot be found on the search path.
    """
    self.command, self.server, self.port = command, server, port
    self.timeout = timeout
    self.option = option.split()
    self.rcfile, self.pattern = rcfile, pattern
    # No connection is opened at construction time.
    self.socket = None
    self.subprocess = None
    self.jumanpp = jumanpp

    if self.rcfile:
        rc_path = os.path.expanduser(self.rcfile)
        if not os.path.isfile(rc_path):
            raise Exception("Can't read rcfile (%s)!" % self.rcfile)

    knp_path = distutils.spawn.find_executable(self.command)
    if knp_path is None:
        raise Exception("Can't find KNP command: %s" % self.command)

    juman_kwargs = dict(command=jumancommand, rcfile=jumanrcfile, jumanpp=self.jumanpp)
    self.juman = Juman(**juman_kwargs)
def __init__(self):
    """Create default-configured Juman and KNP analyzers."""
    self.juman = Juman()
    self.knp = KNP()
# coding: utf-8
"""Segment every line of the sample file with Juman.

One space-separated line of surface forms is written per input line to
``<input_file>_juman_result.txt``.
"""
from pyknp import Juman
import sys
import codecs

juman = Juman()
input_file = "../data/sample.txt"
output_file = input_file + '_juman_result.txt'

# Context managers close both files even if the analysis raises.
with codecs.open(input_file, 'r', 'utf-8') as f, \
        codecs.open(output_file, 'w', 'utf-8') as f_out:
    for line in f:
        # Strip only the line terminator: slicing with [:-1] would drop a
        # real character from a final line that has no trailing newline.
        sentence = line.rstrip('\r\n').replace(" ", "")
        result = juman.analysis(sentence)
        f_out.write(' '.join(mrph.midasi for mrph in result) + '\n')
class Solver(object):
    # Exercise solutions Q61-Q70 built on Juman (morphological analysis) and
    # KNP (dependency parsing). Python 2 code: it relies on raw_input and on
    # byte-string decode/encode. Every method reads from standard input.

    def __init__(self):
        self.juman = Juman()
        self.knp = KNP()

    def Q61(self):
        u"""61. Read a sentence from standard input and write it segmented
        into words (a space is inserted between morphemes).
        """
        input_sentence = raw_input()
        result = self.juman.analysis(input_sentence.decode("utf8"))
        for mrph in result.mrph_list():
            sys.stdout.write("{} ".format(mrph.midasi.encode("utf8")))
        sys.stdout.write("\n")
        return

    def Q62(self):
        u"""62. Read morphological analysis results and print only the nouns.

        Hint: check whether mrph.hinsi equals the string u"名詞".
        """
        data = u""
        for line in iter(sys.stdin.readline, ""):  # read the input line by line
            data += line.decode("utf8")
            if line.strip() == "EOS":  # a sentence has ended: analyze it
                result = self.juman.result(data)
                s = ",".join(mrph.midasi for mrph in result.mrph_list() if mrph.hinsi == u"名詞")  # nouns only
                if len(s) > 0:
                    print(s)
                data = u""

    def Q63(self):
        u"""63. Read morphological analysis results and print the base forms
        of the verbs only.

        Hint: check whether mrph.hinsi equals the string u"動詞".
        """
        data = u""
        for line in iter(sys.stdin.readline, ""):  # read the input line by line
            data += line.decode("utf8")
            if line.strip() == "EOS":  # a sentence has ended: analyze it
                result = self.juman.result(data)
                s = ",".join(mrph.genkei for mrph in result.mrph_list() if mrph.hinsi == u"動詞")  # verbs only
                if len(s) > 0:
                    print(s)
                data = u""

    def Q64(self):
        u"""64. Read morphological analysis results and list the morpheme
        base forms in order of frequency.

        Hint: use a dictionary and the sorted function.
        """
        data = u""
        hist = {}
        for line in iter(sys.stdin.readline, ""):  # read the input line by line
            data += line.decode("utf8")
            if line.strip() == "EOS":  # a sentence has ended: analyze it
                result = self.juman.result(data)
                for mrph in result.mrph_list():
                    try:
                        hist[mrph.genkei] += 1
                    except KeyError:
                        hist[mrph.genkei] = 1
                data = u""
        # Most frequent base form first.
        for key, val in sorted(hist.items(), key=lambda t: t[1], reverse=True):
            print("{},{}".format(key.encode("utf8"), val))

    def Q65(self):
        u"""65. Read morphological analysis results and compute the ratio of
        predicates to the total number of morphemes.

        Predicates here are verbs, i-adjectives and na-adjectives.
        """
        data = u""
        num = 0
        denom = 0
        for line in iter(sys.stdin.readline, ""):  # read the input line by line
            data += line.decode("utf8")
            if line.strip() == "EOS":  # a sentence has ended: analyze it
                result = self.juman.result(data)
                # NOTE(review): `verbose` and `logger` are not defined in this
                # class; presumably module-level names -- confirm.
                if verbose:
                    logger.info("denom: {}".format(denom))
                for mrph in result.mrph_list():
                    denom += 1
                    if mrph.hinsi == u"動詞":
                        num += 1
                        continue
                    if mrph.hinsi == u"形容詞" and mrph.bunrui.startswith(u"イ形容詞"):
                        num += 1
                        continue
                    if mrph.hinsi == u"形容動詞" and mrph.bunrui.startswith(u"ナ形容詞"):
                        num += 1
                        continue
                data = u""
        # NOTE(review): raises ZeroDivisionError when no morpheme was read.
        print("{}/{}={}".format(num, denom, float(num) / denom))

    def Q66(self):
        u"""66. Read morphological analysis results and print every
        "sa-hen noun + する/できる" pattern.
        """
        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):  # read the input line by line
            data += line.decode("utf8")
            if line.strip() == "EOS":  # a sentence has ended: analyze it
                result = self.juman.result(data)
                # buff holds the previous morpheme when it was a sa-hen noun.
                buff = None
                for mrph in result.mrph_list():
                    if mrph.genkei == u"できる" or mrph.genkei == u"する":
                        if buff is not None:
                            extract.add((buff.genkei.encode("utf8"), mrph.genkei.encode("utf8")))
                    if mrph.bunrui == u"サ変名詞":
                        buff = mrph
                    else:
                        buff = None
                data = u""
        for t in extract:
            print("{}+{}".format(t[0], t[1]))

    def Q67(self):
        u"""67. Read morphological analysis results and print every
        "A の B" expression (A and B are single-morpheme nouns).
        """
        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):  # read the input line by line
            data += line.decode("utf8")
            if line.strip() == "EOS":  # a sentence has ended: analyze it
                result = self.juman.result(data)
                # buff accumulates [A] and then [A, の] while the pattern matches.
                buff = []
                for mrph in result.mrph_list():
                    if mrph.genkei == u"の" and len(buff) == 1:
                        buff.append(u"の")
                        continue
                    if mrph.hinsi == u"名詞":
                        if len(buff) == 0:
                            buff.append(mrph.genkei)
                            continue
                        if len(buff) == 2:
                            extract.add((buff[0], mrph.genkei))
                    buff = []
                data = u""
        for t in extract:
            print("{}の{}".format(t[0].encode("utf8"), t[1].encode("utf8")))

    def Q68(self):
        u"""68. Read a sentence from standard input and write it segmented
        into bunsetsu phrases (a space is inserted between phrases).
        """
        input_sentence = raw_input()
        result = self.knp.parse(input_sentence.decode("utf8"))
        for bnst in result.bnst_list():
            sys.stdout.write("{} ".format("".join(mrph.midasi.encode("utf8") for mrph in bnst.mrph_list())))
        sys.stdout.write("\n")
        return

    def Q69(self):
        u"""69. Read parse results and print the bunsetsu phrases that
        contain a prefix.
        """
        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):
            data += line.decode("utf8")
            if line.strip() == "EOS":
                result = self.knp.result(data)
                for bnst in result.bnst_list():
                    # len(filter(...)) needs Python 2, where filter returns a list.
                    if len(filter(lambda x: x.hinsi == u"接頭辞", bnst.mrph_list())) < 1:
                        continue
                    extract.add("{} ".format("".join(mrph.midasi.encode("utf8") for mrph in bnst.mrph_list())))
                data = u""
        for bnst in extract:
            if len(bnst) > 0:
                print(bnst)
        return

    def Q70(self):
        u"""70. Read parse results and print the bunsetsu phrases that
        contain two or more nouns.
        """
        data = u""
        extract = set()
        for line in iter(sys.stdin.readline, ""):
            data += line.decode("utf8")
            if line.strip() == "EOS":
                result = self.knp.result(data)
                for bnst in result.bnst_list():
                    # len(filter(...)) needs Python 2, where filter returns a list.
                    if len(filter(lambda x: x.hinsi == u"名詞", bnst.mrph_list())) < 2:
                        continue
                    extract.add("{} ".format("".join(mrph.midasi.encode("utf8") for mrph in bnst.mrph_list())))
                data = u""
        for bnst in extract:
            if len(bnst) > 0:
                print(bnst)
        return
class KNP(object):
    """Run KNP dependency parsing, or read existing KNP output.

    Args:
        command (str): KNP command.
        server (str): KNP server host; when None, KNP runs as a subprocess.
        port (int): KNP server port.
        timeout (int): Timeout value stored on the instance.
        option (str): KNP options. ``-tab`` (detailed output) is required;
            e.g. ``-anaphora`` adds ellipsis/anaphora resolution and
            ``-dpnd`` parses dependencies without case analysis.
        rcfile (str): Path to the KNP configuration file.
        pattern (str): Terminator symbol of KNP output.
        jumancommand (str): JUMAN command.
        jumanrcfile (str): Path to the JUMAN configuration file.
        jumanpp (bool): Whether to use JUMAN++ instead of JUMAN.

    Raises:
        Exception: If ``rcfile`` is given but is not a file, or if the KNP
            command cannot be found on the search path.
    """

    def __init__(self, command='knp', server=None, port=31000, timeout=60, option='-tab', rcfile='', pattern=r'EOS',
                 jumancommand='jumanpp', jumanrcfile='', jumanpp=True):
        self.command = command
        self.server = server
        self.port = port
        self.timeout = timeout
        self.option = option.split()
        self.rcfile = rcfile
        self.pattern = pattern
        # The connection is opened lazily by the first call to parse().
        self.socket = None
        self.subprocess = None
        self.jumanpp = jumanpp

        if self.rcfile and not os.path.isfile(os.path.expanduser(self.rcfile)):
            raise Exception("Can't read rcfile (%s)!" % self.rcfile)
        if distutils.spawn.find_executable(self.command) is None:
            raise Exception("Can't find KNP command: %s" % self.command)

        self.juman = Juman(command=jumancommand, rcfile=jumanrcfile, jumanpp=self.jumanpp)

    def knp(self, sentence):
        """Same as :meth:`parse`.

        Args:
            sentence (str): The sentence to parse.

        Returns:
            BList: The bunsetsu list object.
        """
        # The result used to be dropped, so callers always received None.
        return self.parse(sentence)

    def parse(self, sentence, juman_format=JUMAN_FORMAT.DEFAULT):
        """Parse a sentence string and return its bunsetsu list object.

        Args:
            sentence (str): The sentence; must be a text (unicode) string.
            juman_format (JUMAN_FORMAT): Lattice output format of Juman.

        Returns:
            BList: The bunsetsu list object.
        """
        assert(isinstance(sentence, six.text_type))
        juman_lines = self.juman.juman_lines(sentence)
        juman_str = "%s%s" % (juman_lines, self.pattern)

        # Open the socket or subprocess on first use and reuse it afterwards.
        if not self.socket and not self.subprocess:
            if self.server is not None:
                self.socket = Socket(
                    self.server, self.port, "RUN -tab -normal\n")
            else:
                command = [self.command] + self.option
                if self.rcfile:
                    command.extend(['-r', self.rcfile])
                self.subprocess = Subprocess(command)

        if self.socket:
            knp_lines = self.socket.query(juman_str, pattern=r'^%s$' % (self.pattern))
        else:
            knp_lines = self.subprocess.query(juman_str, pattern=r'^%s$' % (self.pattern))
        return BList(knp_lines, self.pattern, juman_format)

    def result(self, input_str, juman_format=JUMAN_FORMAT.DEFAULT):
        """Convert the KNP output for one sentence into a bunsetsu list object.

        Args:
            input_str (str): KNP output for a single sentence.
            juman_format (JUMAN_FORMAT): Lattice output format of Juman.

        Returns:
            BList: The bunsetsu list object.
        """
        return BList(input_str, self.pattern, juman_format)