예제 #1
0
from k54 import load_xml
import re

# Build a table in which sentence_list[sentence_index][token_index] is the
# word at that position.  A dummy entry is stored at index 0 of the outer
# list and of every inner list so that the sentence/token numbers read from
# the XML can be used as indices unchanged (presumably they are 1-based).
sentence_list = [[]]

for sentence in load_xml()['root']['document']['sentences']['sentence']:
    tokens = sentence['tokens']['token']

    # A lone token arrives as a single dict rather than a list of dicts
    # (the original note says sentence 13 is such a case); wrap it so both
    # shapes go through the same code path.
    if isinstance(tokens, dict):
        tokens = [tokens]

    sentence_list.append([''] + [token['word'] for token in tokens])

for core in load_xml()['root']['document']['coreference']['coreference']:
    rep = ''  # text of the representative mention seen so far in this chain
    for mention in core['mention']:
        # The representative mention only supplies the text to insert.
        if '@representative' in mention:
            rep = mention['text']
            continue

        sent = int(mention['sentence'])
        start = int(mention['start'])
        # NOTE(review): `end` is parsed but never used in the visible code;
        # the snippet looks like it stops before the closing marker is added.
        end = int(mention['end'])

        # Prefix the first token of the referring mention with
        # '「 <representative text> ( '.
        first_word = sentence_list[sent][start]
        sentence_list[sent][start] = '「 ' + rep + ' ( ' + first_word
예제 #2
0
from k54 import load_xml


def print_person_name(word='', pos='', ner=''):
    """Print *word* if it is tagged as a proper noun that names a person.

    Args:
        word: Surface form of the token.
        pos: Part-of-speech tag; only 'NNP' qualifies.
        ner: Named-entity tag; only 'PERSON' qualifies.

    Nothing is printed unless both tags match.
    """
    if (pos, ner) != ('NNP', 'PERSON'):
        return
    print(word)


for sentence in load_xml()['root']['document']['sentences']['sentence']:
    tokens = sentence['tokens']['token']

    # A sentence with only one token yields a single dict instead of a list
    # of dicts; treat it as a one-element list so one loop covers both cases.
    token_seq = [tokens] if isinstance(tokens, dict) else tokens

    for t in token_seq:
        print_person_name(t['word'], t['POS'], t['NER'])
예제 #3
0
파일: k59.py 프로젝트: kamuiroeru/NLP_Knock
from k54 import load_xml
import re

# Collect the bracketed parse string of every sentence.
parse_strings = [
    sentence['parse']
    for sentence in load_xml()['root']['document']['sentences']['sentence']
]

# Matches a space that sits directly in front of a punctuation mark so the
# space can be dropped when the words are joined back together.
punct_space = re.compile(r' ([,.;:?!])')

for ps in parse_strings:
    # Visit every '(NP' occurrence from left to right.  The search begins at
    # index 1, so index 0 of the parse string is never examined.
    start = ps.find('(NP', 1)
    while start != -1:
        # Walk forward until the parenthesis opened at `start` is closed.
        # `depth` is the number of currently open parentheses; when the loop
        # finishes, `end` is one past the matching ')'.
        end = start + 1
        depth = 1
        while depth:
            ch = ps[end]
            if ch == '(':
                depth += 1
            elif ch == ')':
                depth -= 1
            end += 1

        # The words of the phrase are the space-separated pieces that end in
        # ')'; strip the parentheses to leave the bare word.
        # NOTE(review): the slice reaches one character beyond the matching
        # ')'.  Presumably harmless when that character is a space or another
        # ')' -- confirm against the actual parse format.
        words = [
            piece.replace(')', '')
            for piece in ps[start:end + 1].split(' ')
            if piece.endswith(')')
        ]

        # Remove the space before commas, periods and similar punctuation.
        phrase = punct_space.sub(r'\1', ' '.join(words))
        # Turn the bracket placeholders back into real parentheses.
        phrase = phrase.replace('-LRB- ', '(').replace(' -RRB-', ')')

        print(phrase)

        start = ps.find('(NP', start + 1)