Example #1
 def sample_deppara_merge_b(self):
     """m_xml的な文分けをするやつのばあい、こうなってほしい
     """
     doc = nlelement.Document()
     sentence = nlelement.Sentence()
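     # The two chunk() arguments appear to mark the head and function-word token
     # positions within the chunk (an inference; the maker's API is not shown here).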
     chunk = self.maker.chunk(0, 1)
     chunk.tokens.append(self.maker.token('私', '名詞'))
     chunk.tokens.append(self.maker.token('は', '助詞'))
     self.maker.append_chunk_to_sentence(sentence, chunk)
     chunk = self.maker.chunk(0, 1)
     chunk.tokens.append(self.maker.token('彼', '名詞'))
     chunk.tokens.append(self.maker.token('を', '助詞'))
     self.maker.append_chunk_to_sentence(sentence, chunk)
     chunk = self.maker.chunk(0, 1)
     chunk.tokens.append(self.maker.token('助け', '動詞'))
     chunk.tokens.append(self.maker.token('た', '助詞'))
     self.maker.append_chunk_to_sentence(sentence, chunk)
     sentence.chunks[0].link = sentence.chunks[2]
     sentence.chunks[1].link = sentence.chunks[2]
     doc.sentences.append(sentence)
     sentence = nlelement.Sentence()
     chunk = self.maker.chunk(0, 0)
     chunk.tokens.append(self.maker.token('>', '記号'))
     self.maker.append_chunk_to_sentence(sentence, chunk)
     chunk = self.maker.chunk(0, 1)
     chunk.tokens.append(self.maker.token('今日', '名詞'))
     chunk.tokens.append(self.maker.token('の', '助詞'))
     self.maker.append_chunk_to_sentence(sentence, chunk)
     chunk = self.maker.chunk(0, 0)
     chunk.tokens.append(self.maker.token('思い出', '名詞'))
     self.maker.append_chunk_to_sentence(sentence, chunk)
     sentence.chunks[1].link = sentence.chunks[2]
     doc.sentences.append(sentence)
     self.maker.set_id_to_sentences(doc)
     return doc
Example #2
    def __load_document__(self, lines):
        docs = []
        self.ids.sent.reset()
        doc = nlelement.Document()
        docs.append(doc)
        sentence = nlelement.Sentence()
        sentence.sid = self.ids.sent.get()
        self.ids.chunk.reset()
        self.ids.tok.reset()
        self.entity_ids = dict()

        chunk = None
        for self.line_num, self.line in enumerate(
                map(lambda x: x.rstrip('\r\n'), lines)):
            if self.line == 'EOT':
                self.__resolve_entity_id__(doc)
                self.ids.sent.reset()
                self.entity_ids = dict()
                doc = nlelement.Document()
                docs.append(doc)
            elif self.line == 'EOS':
                # NOTE: the underlying cabocha module has a deep dark corner where sid never gets assigned, so set it here
                for tok in sentence.tokens:
                    tok.sid = sentence.sid
                self.__validate_sentence__(sentence)
                doc.sentences.append(sentence)
                sentence = nlelement.Sentence()
                sentence.sid = self.ids.sent.get()
                chunk = None
                self.ids.chunk.reset()
                self.ids.tok.reset()
            # startswith() instead of self.line[0] avoids an IndexError on empty
            # lines, which are only filtered out further down this chain
            elif self.line.startswith('#'):
                if self.line.startswith('#!'):
                    self.__handle_comment__(self.line)
            elif self.line.startswith('*') or self.line.startswith('+'):
                if chunk:
                    chunk.set_token_info()
                chunk = self.__load_chunk__(self.line, sentence.sid)

                chunk.cid = self.ids.chunk.get()
                sentence.chunks.append(chunk)
            elif len(self.line) == 0:
                pass
            else:
                token = self.__load_token__(self.line)
                token.tid = self.ids.tok.get()
                chunk.tokens.append(token)
                chunk.token_num += 1
                self.__token_post_process__(chunk, token)
                sentence.tokens.append(token)
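        # NOTE: a trailing sentence that was never closed by 'EOS' is not appended
        # to the document below; only the final exophora/entity resolution runs.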
        if doc.sentences and sentence.tokens:
            self.__add_exophora__()
            self.__resolve_entity_id__(doc)
            self.ids.sent.reset()
            self.entity_ids = dict()
        return docs
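For reference, a minimal sketch of the line-oriented input that __load_document__ consumes, reconstructed from the branches above; the chunk-header fields and token feature columns are assumptions modeled on CaboCha's lattice format, not taken from this codebase:

SAMPLE_LINES = [
    '#! DOC 0',             # '#!' lines are routed to __handle_comment__
    '* 0 1D 0/1 0.000000',  # '*' (or '+') opens a new chunk
    '私\t名詞,代名詞,一般,*,*,*,私,ワタシ,ワタシ',  # other non-empty lines are tokens
    'は\t助詞,係助詞,*,*,*,*,は,ハ,ワ',
    '* 1 -1D 0/0 0.000000',
    '行った\t動詞,自立,*,*,*,*,行く,イッタ,イッタ',
    'EOS',                  # closes the current sentence
    'EOT',                  # closes the current document and starts a new one
]
# docs = reader.__load_document__(SAMPLE_LINES)  # the 'reader' instance is hypothetical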
Example #3
    def sample_diffreference_converter_a(self):
        """DiffReferenceConverterのテストデータ(1)
        """
        doc = nlelement.Document()
        sentence = nlelement.Sentence()
        chunk = self.maker.chunk(0, 1)
        chunk.tokens.append(self.maker.token('アプリケーション', '名詞'))
        chunk.tokens.append(self.maker.token('は', '助詞'))
        self.maker.append_chunk_to_sentence(sentence, chunk)
        chunk = self.maker.chunk(1, 2)
        chunk.tokens.append(self.maker.token('終了', '名詞'))
        chunk.tokens.append(self.maker.token('し', '動詞'))
        chunk.tokens.append(self.maker.token('まし', '助動詞'))
        chunk.tokens.append(self.maker.token('た', '助詞'))
        chunk.tokens.append(self.maker.token('。', '記号'))
        self.maker.append_chunk_to_sentence(sentence, chunk)
        sentence.chunks[0].link = sentence.chunks[1]
        doc.sentences.append(sentence)
        
        sentence = nlelement.Sentence()
        chunk = self.maker.chunk(3, 3)
        chunk.tokens.append(self.maker.token('(', '記号'))
        chunk.tokens.append(self.maker.token('Help', '名詞'))
        chunk.tokens.append(self.maker.token(':', '名詞'))
        chunk.tokens.append(self.maker.token('H', '名詞'))
        chunk.tokens.append(self.maker.token(')', '記号'))
        self.maker.append_chunk_to_sentence(sentence, chunk)
        doc.sentences.append(sentence)

        sentence = nlelement.Sentence()
        chunk = self.maker.chunk(1, 3)
        chunk.tokens.append(self.maker.token('続行', '名詞'))
        chunk.tokens.append(self.maker.token('する', '動詞'))
        chunk.tokens.append(self.maker.token('に', '助詞'))
        chunk.tokens.append(self.maker.token('は', '助詞'))
        self.maker.append_chunk_to_sentence(sentence, chunk)
        chunk = self.maker.chunk(1, 2)
        chunk.tokens.append(self.maker.token('Esc', '名詞'))
        chunk.tokens.append(self.maker.token('キー', '名詞'))
        chunk.tokens.append(self.maker.token('を', '助詞'))
        self.maker.append_chunk_to_sentence(sentence, chunk)
        chunk = self.maker.chunk(0, 2)
        chunk.tokens.append(self.maker.token('押し', '動詞'))
        chunk.tokens.append(self.maker.token('て', '助詞'))
        chunk.tokens.append(self.maker.token('ください', '動詞'))
        chunk.tokens.append(self.maker.token('。', '記号'))
        self.maker.append_chunk_to_sentence(sentence, chunk)
        sentence.chunks[1].link = sentence.chunks[2]
        sentence.chunks[0].link = sentence.chunks[2]
        doc.sentences.append(sentence)
        
        self.maker.set_id_to_sentences(doc)
        return doc
Example #4
 def sample_pas_original(self):
     """pasのテキスト原文側用
     アノテーションがなく、一部加筆がある
     """
     doc = nlelement.Document()
     sentence = nlelement.Sentence()
     chunk = self.maker.chunk(0, 1)
     chunk.tokens.append(self.maker.token('私', '名詞'))
     chunk.tokens.append(self.maker.token('は', '助詞'))
     self.maker.append_chunk_to_sentence(sentence, chunk)
     chunk = self.maker.chunk(0, 1)
     chunk.tokens.append(self.maker.token('花屋', '名詞'))
     chunk.tokens.append(self.maker.token('に', '助詞'))
     self.maker.append_chunk_to_sentence(sentence, chunk)
     chunk = self.maker.chunk(0, 1)
     chunk.tokens.append(self.maker.token('行っ', '動詞'))
     chunk.tokens.append(self.maker.token('た', '助詞'))
     self.maker.append_chunk_to_sentence(sentence, chunk)
     sentence.chunks[0].link = sentence.chunks[2]
     sentence.chunks[1].link = sentence.chunks[2]
     doc.sentences.append(sentence)
     sentence = nlelement.Sentence()
     chunk = self.maker.chunk(0, 1)
     chunk.tokens.append(self.maker.token('そこ', '名詞'))
     chunk.tokens.append(self.maker.token('に', '助詞'))
     self.maker.append_chunk_to_sentence(sentence, chunk)
     chunk = self.maker.chunk(0, 1)
     chunk.tokens.append(self.maker.token('美香', '名詞'))
     chunk.tokens.append(self.maker.token('が', '助詞'))
     self.maker.append_chunk_to_sentence(sentence, chunk)
     chunk = self.maker.chunk(0, 1)
     chunk.tokens.append(self.maker.token('い', '動詞'))
     chunk.tokens.append(self.maker.token('た', '助詞'))
     self.maker.append_chunk_to_sentence(sentence, chunk)
     sentence.chunks[0].link = sentence.chunks[2]
     sentence.chunks[1].link = sentence.chunks[2]
     doc.sentences.append(sentence)
     sentence = nlelement.Sentence()
     chunk = self.maker.chunk(0, 1)
     chunk.tokens.append(self.maker.token('次回', '名詞'))
     chunk.tokens.append(self.maker.token('に', '助詞'))
     self.maker.append_chunk_to_sentence(sentence, chunk)
     chunk = self.maker.chunk(0, 1)
     chunk.tokens.append(self.maker.token('続き', '動詞'))
     chunk.tokens.append(self.maker.token('ます', '助動詞'))
     self.maker.append_chunk_to_sentence(sentence, chunk)
     sentence.chunks[0].link = sentence.chunks[1]
     doc.sentences.append(sentence)
     self.maker.set_id_to_sentences(doc)
     return doc
Example #5
    def sample_pas_annotation(self):
        """pasのアノテーション側用
        述語項関係、共参照関係に加えて文に一部
        """
        doc = nlelement.Document()
        doc.pas_annotated = True
        sentence = nlelement.Sentence()
        chunk = self.maker.chunk(0, 1)
        chunk.tokens.append(self.maker.token('私', '名詞'))
        chunk.tokens.append(self.maker.token('は', '助詞'))
        self.maker.append_chunk_to_sentence(sentence, chunk)
        chunk = self.maker.chunk(0, 1)
        chunk.tokens.append(self.maker.token('花屋', '名詞'))
        chunk.tokens.append(self.maker.token('に', '助詞'))
        self.maker.append_chunk_to_sentence(sentence, chunk)
        chunk = self.maker.chunk(0, 1)
        chunk.tokens.append(self.maker.token('行っ', '動詞'))
        chunk.tokens.append(self.maker.token('た', '助詞'))
        self.maker.append_chunk_to_sentence(sentence, chunk)
        sentence.chunks[0].link = sentence.chunks[2]
        sentence.chunks[1].link = sentence.chunks[2]
        doc.sentences.append(sentence)
        sentence = nlelement.Sentence()
        chunk = self.maker.chunk(0, 1)
        chunk.tokens.append(self.maker.token('そこ', '名詞'))
        chunk.tokens.append(self.maker.token('に', '助詞'))
        self.maker.append_chunk_to_sentence(sentence, chunk)
        chunk = self.maker.chunk(0, 1)
        chunk.tokens.append(self.maker.token('美香', '名詞'))
        chunk.tokens.append(self.maker.token('が', '助詞'))
        self.maker.append_chunk_to_sentence(sentence, chunk)
        chunk = self.maker.chunk(0, 1)
        chunk.tokens.append(self.maker.token('い', '動詞'))
        chunk.tokens.append(self.maker.token('た', '助詞'))
        self.maker.append_chunk_to_sentence(sentence, chunk)
        sentence.chunks[0].link = sentence.chunks[2]
        sentence.chunks[1].link = sentence.chunks[2]
        doc.sentences.append(sentence)
        self.maker.set_id_to_sentences(doc) # NOTE: assign ids here because values such as coreference labels cannot be converted accurately otherwise

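        # The positional arguments appear to be (doc, predicate sentence index,
        # predicate token index, case label, antecedent sentence index,
        # antecedent token index); inferred from these samples, not a documented API.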
        self.maker.add_coreference_link(doc, 0, 4, 'ga', 0, 0)
        self.maker.add_coreference_link(doc, 0, 4, 'ni', 0, 2)

        self.maker.add_coreference_link(doc, 1, 4, 'ga', 1, 2)
        self.maker.add_coreference_link(doc, 1, 4, 'ni', 1, 0)

        self.maker.add_coreference_link(doc, 1, 0, 'coref', 0, 2)


        return doc
Example #6
    def sample_pth_annotation(self):
        """pthのアノテーション文側用
        """
        doc = nlelement.Document()
        doc.pt_annotated = True
        sentence = nlelement.Sentence()
        sentence.pt_annotated = True
        chunk = self.maker.chunk(0, 1)
        chunk.tokens.append(self.maker.token('そこ', '名詞'))
        chunk.tokens.append(self.maker.token('に', '助詞'))
        self.maker.append_chunk_to_sentence(sentence, chunk)
        chunk = self.maker.chunk(0, 1)
        chunk.tokens.append(self.maker.token('美香', '名詞'))
        chunk.tokens.append(self.maker.token('が', '助詞'))
        self.maker.append_chunk_to_sentence(sentence, chunk)
        chunk = self.maker.chunk(0, 1)
        chunk.tokens.append(self.maker.token('い', '動詞'))
        chunk.tokens.append(self.maker.token('た', '助詞'))
        self.maker.append_chunk_to_sentence(sentence, chunk)
        sentence.chunks[0].link = sentence.chunks[2]
        sentence.chunks[1].link = sentence.chunks[2]
        doc.sentences.append(sentence)
        self.maker.set_id_to_sentences(doc)
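        # The argument order appears to mirror add_coreference_link above:
        # (doc, sentence index, predicate token index, role label,
        # filler sentence index, filler token index); an inference from the data.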
        self.maker.add_semantic_role(doc, 0, 4, "経験者", 0, 2)
        self.maker.add_semantic_role(doc, 0, 4, "場所", 0, 0)
        self.maker.add_verb_semantic(doc, 0, 4, "状態変化なし(状態)-位置-存在")

        return doc
Example #7
 def parse(self, raw_sentence, sid=0):
     lattice = self.manalyzer.createLattice()
     lattice.set_sentence(raw_sentence)
     sentence = nlelement.Sentence()
     sentence.sid = sid
     self.parser.parse(lattice)
     for node in MeCabParser.iter_nor_node(lattice.bos_node):
         token = self.unpacker.unpack(node.surface, node.feature.split(','))
         token.tid = len(sentence.tokens)
         token.sid = sentence.sid
         sentence.tokens.append(token)
     return sentence
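A minimal usage sketch, assuming the enclosing class is MeCabParser (as the iter_nor_node reference suggests) and that its constructor wires up self.manalyzer, self.parser, and self.unpacker; the no-argument constructor call is an assumption:

parser = MeCabParser()  # hypothetical constructor
sentence = parser.parse('私は彼を助けた', sid=0)
for token in sentence.tokens:
    print(token.tid, token.sid)  # ids assigned inside parse()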
Example #8
 def sample1(self):
     """取りあえず単純なサンプルを生成する
     """
     doc = nlelement.Document()
     sentence = nlelement.Sentence()
     chunk = self.maker.chunk(0, 1)
     chunk.tokens.append(self.maker.token('私', '名詞'))
     chunk.tokens.append(self.maker.token('は', '助詞'))
     self.maker.append_chunk_to_sentence(sentence, chunk)
     chunk = self.maker.chunk(0, 1)
     chunk.tokens.append(self.maker.token('彼', '名詞'))
     chunk.tokens.append(self.maker.token('を', '助詞'))
     self.maker.append_chunk_to_sentence(sentence, chunk)
     chunk = self.maker.chunk(0, 1)
     chunk.tokens.append(self.maker.token('助け', '動詞'))
     chunk.tokens.append(self.maker.token('た', '助詞'))
     self.maker.append_chunk_to_sentence(sentence, chunk)
     self.maker.set_link(sentence, 0, 2)
     self.maker.set_link(sentence, 1, 2)
     doc.sentences.append(sentence)
     self.maker.set_id_to_sentences(doc)
     return doc
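Here set_link(sentence, 0, 2) appears to be a helper for the direct assignment sentence.chunks[0].link = sentence.chunks[2] used in the earlier samples.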