Exemplo n.º 1
0
 def test_mem_text_mem(self):
     """Round-trip check: dump a document to text, reload it, and verify
     that surfaces, predicate arguments and coreference entries survive.
     """
     original = self.samples.sample3_arguments()
     text = cabocha_extended.dump(original)
     reloaded = cabocha_extended.load_from_text(text)[0]

     for src, dst in zip(nlelement.tokens(original), nlelement.tokens(reloaded)):
         self.assertEqual(src.surface, dst.surface)
         if hasattr(src, "predicate_term"):
             pairs = zip(src.predicate_term.items(), dst.predicate_term.items())
             for (case1, vals1), (case2, vals2) in pairs:
                 self.assertEqual(case1, case2)
                 self.assertEqual(len(vals1), len(vals2))
                 for left, right in zip(vals1, vals2):
                     self.assertEqual(left.ana_ref(), right.ana_ref())
                     self.assertEqual(left.ant_ref(), right.ant_ref())
                     self.assertAlmostEqual(left.label, right.label)
                     self.assertAlmostEqual(left.probable, right.probable)
                     # each argument must carry the case key it is filed under
                     self.assertEqual(left.case, case1)
                     self.assertEqual(right.case, case2)
         elif hasattr(dst, "predicate_term"):
             self.fail("predicate argument located invalid position")
         if hasattr(src, "coreference"):
             for left, right in zip(src.coreference, dst.coreference):
                 self.assertEqual(left.ana_ref(), right.ana_ref())
                 self.assertEqual(left.ant_ref(), right.ant_ref())
                 self.assertAlmostEqual(left.label, right.label)
                 self.assertAlmostEqual(left.probable, right.probable)
         elif hasattr(dst, "coreference"):
             self.fail("coreference argument located invalid position")
Exemplo n.º 2
0
 def __get_refered_entities__(document: nlelement.Document,
                              dump_type='scored_output'):
     """Enumerate terms that can act as coreference antecedents,
     predicate arguments, or semantic-role fillers.

     When reading corpus labels (dump_type 'label'/'result'), entries
     come from coreference_link and semroles; when reading tool output
     ('scored_output'/'result'/'standard'), they come from
     predicate_term, coreference and semrole.
     (Original Japanese docstring, translated.)
     """
     refered_entities = []
     # --- corpus-label pass -------------------------------------------
     if dump_type in ['label', 'result']:
         for tok in nlelement.tokens(document):
             if hasattr(tok, "coreference_link"):
                 for key, value in tok.coreference_link.items():
                     if value.antecedent_ref is None:
                         continue
                     refered_entities.append(value.antecedent_ref)
             if hasattr(tok, "semroles"):
                 for key, value in tok.semroles.items():
                     # original comment: "確か" ("I think") — author was unsure
                     if value is None:
                         continue
                     refered_entities.append(value)
                 # NOTE(review): this sorts after *every* token with semroles,
                 # not once after the loop, and later appends (including the
                 # tool-output pass below) leave the list unsorted again.
                 # Looks accidental — confirm before relying on ordering.
                 refered_entities.sort()
     # --- tool-output pass --------------------------------------------
     if dump_type in ['scored_output', 'result', 'standard']:
         for tok in nlelement.tokens(document):
             if hasattr(tok, "predicate_term"):
                 for key, values in tok.predicate_term.items():
                     for value in values:
                         # skip zero-label entries in result/standard dumps
                         # (presumably "not predicted" — confirm)
                         if dump_type in ['result', 'standard'
                                          ] and value.label == 0.0:
                             continue
                         ant_ref = value.ant_ref()
                         if ant_ref is None:
                             continue
                         refered_entities.append(ant_ref)
             if hasattr(tok, "coreference"):
                 values = getattr(tok, "coreference")
                 for value in values:
                     if dump_type in ['result', 'standard'
                                      ] and value.label == 0.0:
                         continue
                     ant_ref = value.ant_ref()
                     if ant_ref is None:
                         continue
                     refered_entities.append(ant_ref)
             if hasattr(tok, "semrole"):
                 for key, values in tok.semrole.items():
                     for value in values:
                         if dump_type in ['result', 'standard'
                                          ] and value.label == 0.0:
                             continue
                         ant_ref = value.ant_ref()
                         if ant_ref is None:
                             continue
                         refered_entities.append(ant_ref)
     return refered_entities
Exemplo n.º 3
0
def load_to_tuple(text, single_doc=False):
    """Load analysis results in the tuple format expected by the
    synchacall module (morphological and syntactic information is not
    included in the return value).

    Returns a (pred_tuples, coref_tuples, []) triple per document, or a
    single such triple when single_doc is True.
    """
    loader = CabochaLoader('')
    file = io.StringIO(text)
    if single_doc:
        doc = loader.__load_document__(file)[0]
        docs = [doc]
    else:
        docs = loader.__load_document__(file)[0]
    file.close()
    result = []
    for doc in docs:
        pred_tups = []
        coref_tups = []
        for tok in nlelement.tokens(doc):
            if hasattr(tok, "predicate_term"):
                for case, args in tok.predicate_term.items():
                    for arg in args:
                        # FIX: ant_ref/ana_ref are accessor methods (they are
                        # called as methods everywhere else in this module);
                        # the original accessed .to_tuple on the bound method.
                        pred_tups.append(
                            (*arg.ant_ref().to_tuple(),
                             *arg.ana_ref().to_tuple(),
                             case, arg.label, arg.probable))
            if hasattr(tok, "coreference"):
                for arg in tok.coreference:
                    # FIX: the original called list.append with several
                    # positional arguments (TypeError at runtime); append a
                    # single tuple, mirroring the predicate branch above.
                    coref_tups.append(
                        (*arg.ant_ref().to_tuple(),
                         *arg.ana_ref().to_tuple(),
                         arg.label, arg.probable))
        result.append((pred_tups, coref_tups, []))
    return result[0] if single_doc else result
Exemplo n.º 4
0
 def __merge_chunkofdoc__(self, orig, annotation):
     """Add Chunk objects to ``orig`` based on the chunk (bunsetsu)
     boundary information found in ``annotation``.
     (Original Japanese docstring, translated.)
     """
     # Running character counts over the two token streams; used to keep
     # the tokenizations aligned even when token boundaries differ.
     orig_count = 0
     anno_count = 0
     # NOTE(review): orig_to/anno_to and orig_cid/anno_cid are never used
     # below — they look like leftovers.
     orig_to, anno_to = 0, 0
     orig_cid, anno_cid = 0, 0
     orig_titer = iter(nlelement.tokens(orig))
     for chunk in nlelement.chunks(annotation):
         orig_chunk = nlelement.Chunk()
         # token indices within the current chunk (-1 = none seen yet)
         orig_ti, anno_ti = -1, -1
         orig_surf, anno_surf = '', ''
         for tok in chunk.tokens:
             anno_surf += tok.surface
             anno_count += tok.get_length()
             anno_ti += 1
             # Consume original tokens until both streams cover the same
             # number of characters.
             while orig_count < anno_count:
                 try:
                     orig_tok = next(orig_titer)
                 except StopIteration:
                     # original ran out of tokens; nothing left to merge
                     return
                 orig_count += orig_tok.get_length()
                 orig_chunk.tokens.append(orig_tok)
                 orig_ti += 1
                 orig_surf += orig_tok.surface
             if orig_count == anno_count:
                 if orig_surf != anno_surf:
                     # surfaces disagree at an aligned boundary
                     self.error_count += 1
                     # TODO: add repair handling (original TODO, translated)
                 orig_surf = ''
                 anno_surf = ''
             if anno_ti == chunk.head_position:
                 orig_chunk.head_position = orig_ti
             elif anno_ti == chunk.func_position:
                 orig_chunk.func_position = orig_ti
         # NOTE(review): this loop reuses the name `chunk`, shadowing the
         # outer loop variable; harmless because the outer `for` rebinds
         # it each iteration, but worth renaming for clarity.
         for chunk in self.__detect_chunk_border__(orig_chunk):
             if chunk.sid < 0:
                 # cur_sent appears to be exposed by the token iterator —
                 # TODO confirm against nlelement.tokens
                 chunk.sid = orig_titer.cur_sent.sid
             orig_sent = orig.refer_sentence(chunk.sid)
             chunk.cid = len(orig_sent.chunks)
             orig_sent.chunks.append(chunk)
Exemplo n.º 5
0
 def postproccess_doc(document: nlelement.Document):
     """Strip temporary entity-resolution attributes from every token
     in the document.
     """
     for token in nlelement.tokens(document):
         for attr_name in ("entity_id", "entity_links"):
             if hasattr(token, attr_name):
                 delattr(token, attr_name)
Exemplo n.º 6
0
    def __set_entity_links__(document: nlelement.Document,
                             coref_id_table,
                             entity_id_table,
                             dump_type='scored_output'):
        """Attach ``entity_links`` dicts to tokens, mapping relation keys
        (case names, 'eq', semantic-role names) to lists of
        (entity_id, label, probability) tuples.

        coref_id_table / entity_id_table map reference tuples to integer
        ids and are updated in place; coreference chains receive fresh
        ids from ``coref_id`` as they are first seen.
        """
        coref_id = 1
        # --- corpus-label pass (coreference_link / semroles) ----------
        if dump_type in ['label', 'result']:
            for tok in nlelement.tokens(document):
                if hasattr(tok, "coreference_link"):
                    for key, value in tok.coreference_link.items():
                        ref = value.antecedent_ref
                        if not hasattr(tok, "entity_links"):
                            tok.entity_links = dict()
                        if key == "coref":
                            # in 'result' mode label-derived entries are
                            # prefixed to keep them apart from tool output
                            label_key = "eq" if dump_type != "result" else "label_eq"
                            if label_key not in tok.entity_links:
                                tok.entity_links[label_key] = []
                            if ref is not None and ref.to_tuple(
                            ) in coref_id_table:
                                # antecedent already has an id: join its chain
                                coref_id_table[nlelement.make_reference(
                                    tok).to_tuple()] = coref_id_table[
                                        ref.to_tuple()]
                                tok.entity_links[label_key].append(
                                    (coref_id_table[ref.to_tuple()], 1.0, 0.0))
                            elif ref is not None:
                                # first mention of this chain: mint a new id
                                # and link both antecedent and anaphor to it
                                coref_id_table[ref.to_tuple()] = coref_id
                                coref_id_table[nlelement.make_reference(
                                    tok).to_tuple()] = coref_id
                                coref_id += 1
                                ant = document.refer(ref)
                                if not hasattr(ant, "entity_links"):
                                    ant.entity_links = {}
                                if label_key not in ant.entity_links:
                                    ant.entity_links[label_key] = []
                                ant.entity_links[label_key].append(
                                    (coref_id_table[ref.to_tuple()], 1.0, 0.0))
                                tok.entity_links[label_key].append(
                                    (coref_id_table[ref.to_tuple()], 1.0, 0.0))
                            elif dump_type == 'result':
                                # may become deprecated once exophora is
                                # handled (original comment, translated)
                                tok.entity_links[label_key].append(
                                    (-1, 1.0, 0.0))
                        else:
                            label_key = key if dump_type != "result" else "label_" + key
                            if label_key not in tok.entity_links:
                                tok.entity_links[label_key] = []
                            if ref is not None and ref.to_tuple(
                            ) in entity_id_table:
                                tok.entity_links[label_key].append(
                                    (entity_id_table[ref.to_tuple()], 1.0,
                                     0.0))
                            elif dump_type == 'result':
                                # may become deprecated once exophora is
                                # handled (original comment, translated)
                                tok.entity_links[label_key].append(
                                    (-1, 1.0, 0.0))
                if hasattr(tok, "semroles"):
                    for key, value in tok.semroles.items():
                        label_key = key if dump_type != "result" else "label_" + key
                        ref = value
                        if not hasattr(tok, "entity_links"):
                            tok.entity_links = dict()
                        if label_key not in tok.entity_links:
                            tok.entity_links[label_key] = []
                        if ref is not None and ref.to_tuple(
                        ) in entity_id_table:
                            tok.entity_links[label_key].append(
                                (entity_id_table[ref.to_tuple()], 1.0, 0.0))
                        elif dump_type == 'result':
                            # may become deprecated once exophora is handled
                            # (original comment, translated)
                            tok.entity_links[label_key].append((-1, 1.0, 0.0))
        # --- tool-output pass (predicate_term / coreference / semrole) -
        if dump_type in ['scored_output', 'result', 'standard']:
            for tok in nlelement.tokens(document):
                if hasattr(tok, "predicate_term"):
                    for key, values in tok.predicate_term.items():
                        for value in values:
                            if dump_type in ['result', 'standard'
                                             ] and value.label == 0.0:
                                continue
                            ref = nlelement.TokenReference(
                                value.ant_sid, value.ant_tid)
                            if not hasattr(tok, "entity_links"):
                                tok.entity_links = dict()
                            if key not in tok.entity_links:
                                tok.entity_links[key] = []
                            # NOTE(review): unguarded entity_id_table lookup —
                            # KeyError if the antecedent was never assigned an
                            # id (the semrole branch below guards this; confirm
                            # whether the asymmetry is intentional).
                            tok.entity_links[key].append(
                                (entity_id_table[ref.to_tuple()], value.label,
                                 value.probable))

                if hasattr(tok, "coreference"):
                    values = tok.coreference
                    for value in values:
                        if dump_type in ['result', 'standard'
                                         ] and value.label == 0.0:
                            continue
                        ref = nlelement.TokenReference(value.ant_sid,
                                                       value.ant_tid)
                        if not hasattr(tok, "entity_links"):
                            tok.entity_links = dict()
                        if 'eq' not in tok.entity_links:
                            tok.entity_links['eq'] = []
                        if ref.to_tuple() in coref_id_table:
                            # antecedent already in a chain: reuse its id
                            coref_id_table[nlelement.make_reference(
                                tok).to_tuple()] = coref_id_table[
                                    ref.to_tuple()]
                        else:
                            coref_id_table[ref.to_tuple()] = coref_id
                            coref_id_table[nlelement.make_reference(
                                tok).to_tuple()] = coref_id
                            coref_id += 1
                        tok.entity_links['eq'].append(
                            (coref_id_table[ref.to_tuple()], value.label,
                             value.probable))
                if hasattr(tok, "semrole"):
                    for key, values in tok.semrole.items():
                        for value in values:
                            if dump_type in ['result', 'standard'
                                             ] and value.label == 0.0:
                                continue
                            ref = nlelement.TokenReference(
                                value.ant_sid, value.ant_tid)
                            if not hasattr(tok, "entity_links"):
                                tok.entity_links = dict()
                            if key not in tok.entity_links:
                                tok.entity_links[key] = []
                            if ref.to_tuple() in entity_id_table:
                                tok.entity_links[key].append(
                                    (entity_id_table[ref.to_tuple()],
                                     value.label, value.probable))
Exemplo n.º 7
0
 def __resolve_entity_id__(self, doc):
     """Resolve every entity id recorded on the tokens and store the
     results on token members (predicate_term / coreference / semrole,
     or coreference_link / semroles in label mode), then remove the
     temporary entity_links attribute.
     (Original Japanese docstring, translated.)
     """
     # id -> most recent mention reference, used to link 'eq' chains
     last_entities = {}
     for tok in nlelement.tokens(doc):
         if not hasattr(tok, "entity_links"):
             continue
         if not self.as_label:
             for key, value in tok.entity_links.items():
                 if key in ['ga', 'o', 'ni']:
                     for entity_tup in value:
                         if entity_tup[0] in self.entity_ids:
                             if not hasattr(tok, "predicate_term"):
                                 tok.predicate_term = dict()
                             if key not in tok.predicate_term:
                                 tok.predicate_term[key] = []
                             ana_ref = nlelement.make_reference(tok)
                             ant_tok = self.entity_ids[entity_tup[0]]
                             # exophoric entries are already references
                             ant_ref = (nlelement.make_reference(ant_tok)
                                        if not isinstance(
                                            ant_tok, nlelement.ExoReference)
                                        else ant_tok)
                             tok.predicate_term[key].append(
                                 argument.PredicateArgument(
                                     *ana_ref.to_tuple(),
                                     *ant_ref.to_tuple(), key,
                                     *entity_tup[1:]))
                 elif key == "eq":
                     for entity_tup in value:
                         if entity_tup[0] in last_entities:
                             ana_ref = nlelement.make_reference(tok)
                             ant_ref = last_entities[entity_tup[0]]
                             if not hasattr(tok, 'coreference'):
                                 setattr(tok, 'coreference', [])
                             tok.coreference.append(
                                 argument.CoreferenceArgument(
                                     *ana_ref.to_tuple(),
                                     *ant_ref.to_tuple(),
                                     *entity_tup[1:]))
                         # this token becomes the newest mention of the chain
                         last_entities[entity_tup[0]] = \
                             nlelement.make_reference(tok)
                 else:
                     # semantic-role links
                     for entity_tup in value:
                         # FIX: original tested `entity_tup in self.entity_ids`
                         # (the whole (id, label, probable) tuple) and indexed
                         # with it; entity_ids is keyed by the id alone, as in
                         # every other branch, so this never matched.
                         if entity_tup[0] in self.entity_ids:
                             if not hasattr(tok, "semrole"):
                                 tok.semrole = dict()
                             if key not in tok.semrole:
                                 tok.semrole[key] = []
                             ant_tok = self.entity_ids[entity_tup[0]]
                             ant_ref = (nlelement.make_reference(ant_tok)
                                        if not isinstance(
                                            ant_tok, nlelement.ExoReference)
                                        else ant_tok)
                             ana_ref = nlelement.make_reference(tok)
                             tok.semrole[key].append(
                                 argument.PredicateArgument(
                                     *ana_ref.to_tuple(),
                                     *ant_ref.to_tuple(), key,
                                     *entity_tup[1:]))
         else:
             # label mode: store single entries, not scored lists
             for key, value in tok.entity_links.items():
                 if key in ['ga', 'o', 'ni', "eq"]:
                     for entity_tup in value:
                         if key == "eq":
                             if entity_tup[0] in last_entities:
                                 ana_ref = nlelement.make_reference(tok)
                                 ant_tok = last_entities[entity_tup[0]]
                                 ant_ref = nlelement.make_reference(ant_tok)
                                 if not hasattr(tok, "coreference_link"):
                                     tok.coreference_link = dict()
                                 tok.coreference_link[key] = \
                                     nlelement.CoreferenceEntry(
                                         ana_ref, ant_ref, None, None, '')
                             last_entities[entity_tup[0]] = \
                                 nlelement.make_reference(tok)
                         elif entity_tup[0] in self.entity_ids:
                             # labels must be exact (gold) matches
                             if entity_tup[1] != 1.0:
                                 continue
                             if not hasattr(tok, "coreference_link"):
                                 tok.coreference_link = dict()
                             ana_ref = nlelement.make_reference(tok)
                             ant_tok = self.entity_ids[entity_tup[0]]
                             ant_ref = (nlelement.make_reference(ant_tok)
                                        if not isinstance(
                                            ant_tok, nlelement.ExoReference)
                                        else ant_tok)
                             tok.coreference_link[key] = \
                                 nlelement.CoreferenceEntry(
                                     ana_ref, ant_ref, None, None, '')
                 else:
                     for entity_tup in value:
                         # FIX: same tuple-vs-id key bug as above, and the
                         # original initialized tok.semrole after testing
                         # hasattr(tok, "semroles") but then assigned to
                         # tok.semroles[key] — an AttributeError on the first
                         # semantic role. Use semroles consistently (label
                         # mode stores a single reference per key).
                         if entity_tup[0] in self.entity_ids:
                             if entity_tup[1] != 1.0:
                                 continue
                             if not hasattr(tok, "semroles"):
                                 tok.semroles = dict()
                             ant_tok = self.entity_ids[entity_tup[0]]
                             tok.semroles[key] = (
                                 nlelement.make_reference(ant_tok)
                                 if not isinstance(
                                     ant_tok, nlelement.ExoReference)
                                 else ant_tok)
         delattr(tok, "entity_links")
Exemplo n.º 8
0
"""テキストを解析してdocにしてから単語のbi_gramを作成
"""
from nlelement.loaders import MeCabParser
from nlelement import nlelement

SAMPLE_TEXT = """テキストの本文です。
MeCabによって解析を行ってオブジェクトに変換し、bigramにします。
bigramは辞書オブジェクトとして保存されます。
"""
parser = MeCabParser()
doc = parser.parse_document(SAMPLE_TEXT)
# ※docはデータベースなどからすでに取得済みであると仮定
last_tok = None
n_gram_table = {}
for tok in nlelement.tokens(doc):
    if last_tok:
        key = (last_tok.surface, tok.surface)
        if key not in n_gram_table:
            n_gram_table[key] = 0
        n_gram_table[key] += 1
    last_tok = tok
print(n_gram_table)