Пример #1
0
    def enrich_treebank(self, a_treebank):
        """Hand each speaker sentence in this bank its matching parse tree.

        The ``abstract_bank`` enrichment runs first.  Then, for every
        document in this bank, each sentence's ``line_number`` is used to
        index the document's ``tree_document`` and the resulting tree is
        passed to the sentence's ``enrich_tree``.

        One ``.`` per document is written to stderr as a progress marker,
        followed by a single newline once all documents are done.
        """
        abstract_bank.enrich_treebank(self, a_treebank)

        for speaker_doc in self:
            # progress marker: one dot per document
            sys.stderr.write(".")

            for sentence in speaker_doc:
                matching_tree = speaker_doc.tree_document[sentence.line_number]
                sentence.enrich_tree(matching_tree)

        # terminate the line of progress dots
        sys.stderr.write("\n")
Пример #2
0
    def enrich_treebank(self, a_treebank, a_cursor=None):
        """Align every name entity in this bank with its parse tree.

        After the ``abstract_bank`` enrichment, each document's
        ``name_entity_sets`` are paired positionally with the trees of its
        ``tree_document``; every entity is passed to ``enrich_tree`` and,
        if it ends up with both a start and an end leaf, to
        ``check_tree_alignment``.

        Per-document handling of length mismatches:

        * more name entity sets than trees -- the mismatch is reported via
          ``on.common.log.report``, every entity in the document is marked
          ``valid = False`` and the document is skipped;
        * fewer name entity sets than trees -- the document's
          ``name_entity_sets`` is padded with newly constructed
          ``name_entity_set`` objects until the lengths match.

        One ``.`` per document is written to stderr as a progress marker.
        Summary counts are written to stderr only when
        ``on.common.log.DEBUG`` is set and the verbosity is at its maximum.

        ``a_cursor`` is accepted for interface compatibility but is not
        used by this method.

        Raises:
            AssertionError: if a tree's sentence index differs from the
                ``sentence_index`` of an entity paired with it.
        """
        # summary statistics, only reported in the debug output below
        total_nes = 0
        total_ne_node_mismatches = 0
        total_ne_terminals = 0
        total_ne_non_terminals = 0

        abstract_bank.enrich_treebank(self, a_treebank)

        #---- for each document in the list of name tagged documents ----#
        for a_name_tagged_document in self:
            sys.stderr.write(".")

            a_name_entity_sets = a_name_tagged_document.name_entity_sets
            a_tree_document = a_name_tagged_document.tree_document

            if len(a_name_entity_sets) > len(a_tree_document):
                on.common.log.report(
                    "name",
                    " found a mismatch in number of elements in the lists SERIOUS",
                    nes=len(a_name_entity_sets),
                    tids=len(a_tree_document),
                    sets=a_name_entity_sets)
                # nothing in this document can be aligned reliably, so
                # invalidate all of its entities and move on
                for a_name_entity_set in a_name_entity_sets:
                    for a_name_entity_list in a_name_entity_set:
                        for a_name_entity in a_name_entity_list:
                            a_name_entity.valid = False
                continue

            # pad with fresh sets so that every tree has a partner in zip().
            # NOTE(review): this terminates only if a_name_entity_sets is the
            # very list the document attribute returns -- confirm.
            while len(a_name_entity_sets) < len(a_tree_document):
                a_name_tagged_document.name_entity_sets.append(
                    name_entity_set(a_tree_document.document_id))

            #---- for each sentence in the document ----#
            for a_tree, a_name_entity_set in zip(a_tree_document,
                                                 a_name_entity_sets):
                #---- for each name type tagged in the sentence ----#
                for a_name_entity_type in a_name_entity_set.name_entity_hash:
                    #---- for each name instance of that type in the sentence ----#
                    for a_name_entity in a_name_entity_set.name_entity_hash[
                            a_name_entity_type]:

                        assert a_tree.get_sentence_index(
                        ) == a_name_entity.sentence_index, (a_tree,
                                                            a_name_entity)

                        a_name_entity.enrich_tree(a_tree)

                        # entities that did not get both boundary leaves are
                        # neither checked nor counted
                        if not a_name_entity.start_leaf or not a_name_entity.end_leaf:
                            continue

                        a_name_entity.check_tree_alignment()

                        a_subtree = a_name_entity.subtree
                        if a_subtree:
                            total_nes += 1

                            if a_subtree.is_leaf():
                                total_ne_terminals += 1
                            else:
                                total_ne_non_terminals += 1

                        #---- if there is no legal tree node aligning with the name ----#
                        else:
                            total_ne_node_mismatches += 1

        #--------------------------------------------------------------------------------#
        # now that we have traversed all the names in the name bank,
        # we will show the summary statistics of how many of them had
        # nodes in the tree aligning with them, etc.
        #--------------------------------------------------------------------------------#
        sys.stderr.write("\n")

        # "== True" is kept on purpose: DEBUG's type is not visible here and
        # plain truthiness would behave differently for non-bool values.
        if (on.common.log.DEBUG == True
                and on.common.log.VERBOSITY >= on.common.log.MAX_VERBOSITY):
            sys.stderr.write("total nes: " + str(total_nes) + "\n")
            sys.stderr.write("total ne-node mismatches: " +
                             str(total_ne_node_mismatches) + "\n")
            sys.stderr.write("total ne-terminals: " + str(total_ne_terminals) +
                             "\n")
            sys.stderr.write("total ne-non-terminals: " +
                             str(total_ne_non_terminals) + "\n")
Пример #3
0
    def enrich_treebank(self, a_treebank, a_cursor=None):
        """Align every name entity in this bank with its parse tree.

        After the ``abstract_bank`` enrichment, each document's
        ``name_entity_sets`` are paired positionally with the trees of its
        ``tree_document``; every entity is passed to ``enrich_tree`` and,
        if it ends up with both a start and an end leaf, to
        ``check_tree_alignment``.

        Per-document handling of length mismatches:

        * more name entity sets than trees -- the mismatch is reported via
          ``on.common.log.report``, every entity in the document is marked
          ``valid = False`` and the document is skipped;
        * fewer name entity sets than trees -- the document's
          ``name_entity_sets`` is padded with newly constructed
          ``name_entity_set`` objects until the lengths match.

        One ``.`` per document is written to stderr as a progress marker.
        Summary counts are written to stderr only when
        ``on.common.log.DEBUG`` is set and the verbosity is at its maximum.

        ``a_cursor`` is accepted for interface compatibility but is not
        used by this method.

        Raises:
            AssertionError: if a tree's sentence index differs from the
                ``sentence_index`` of an entity paired with it.
        """
        # summary statistics, only reported in the debug output below
        total_nes = 0
        total_ne_node_mismatches = 0
        total_ne_terminals = 0
        total_ne_non_terminals = 0

        abstract_bank.enrich_treebank(self, a_treebank)

        #---- for each document in the list of name tagged documents ----#
        for a_name_tagged_document in self:
            sys.stderr.write(".")

            a_name_entity_sets = a_name_tagged_document.name_entity_sets
            a_tree_document = a_name_tagged_document.tree_document

            if len(a_name_entity_sets) > len(a_tree_document):
                on.common.log.report("name", " found a mismatch in number of elements in the lists SERIOUS",
                                     nes=len(a_name_entity_sets), tids=len(a_tree_document),
                                     sets=a_name_entity_sets)
                # nothing in this document can be aligned reliably, so
                # invalidate all of its entities and move on
                for a_name_entity_set in a_name_entity_sets:
                    for a_name_entity_list in a_name_entity_set:
                        for a_name_entity in a_name_entity_list:
                            a_name_entity.valid = False
                continue

            # pad with fresh sets so that every tree has a partner in zip().
            # NOTE(review): this terminates only if a_name_entity_sets is the
            # very list the document attribute returns -- confirm.
            while len(a_name_entity_sets) < len(a_tree_document):
                a_name_tagged_document.name_entity_sets.append(name_entity_set(a_tree_document.document_id))

            #---- for each sentence in the document ----#
            for a_tree, a_name_entity_set in zip(a_tree_document, a_name_entity_sets):
                #---- for each name type tagged in the sentence ----#
                for a_name_entity_type in a_name_entity_set.name_entity_hash:
                    #---- for each name instance of that type in the sentence ----#
                    for a_name_entity in a_name_entity_set.name_entity_hash[a_name_entity_type]:

                        assert a_tree.get_sentence_index() == a_name_entity.sentence_index, (a_tree, a_name_entity)

                        a_name_entity.enrich_tree(a_tree)

                        # entities that did not get both boundary leaves are
                        # neither checked nor counted
                        if not a_name_entity.start_leaf or not a_name_entity.end_leaf:
                            continue

                        a_name_entity.check_tree_alignment()

                        a_subtree = a_name_entity.subtree
                        if a_subtree:
                            total_nes += 1

                            if a_subtree.is_leaf():
                                total_ne_terminals += 1
                            else:
                                total_ne_non_terminals += 1

                        #---- if there is no legal tree node aligning with the name ----#
                        else:
                            total_ne_node_mismatches += 1

        #--------------------------------------------------------------------------------#
        # now that we have traversed all the names in the name bank,
        # we will show the summary statistics of how many of them had
        # nodes in the tree aligning with them, etc.
        #--------------------------------------------------------------------------------#
        sys.stderr.write("\n")

        # "== True" is kept on purpose: DEBUG's type is not visible here and
        # plain truthiness would behave differently for non-bool values.
        if on.common.log.DEBUG == True and on.common.log.VERBOSITY >= on.common.log.MAX_VERBOSITY:
            sys.stderr.write("total nes: " + str(total_nes) + "\n")
            sys.stderr.write("total ne-node mismatches: " + str(total_ne_node_mismatches) + "\n")
            sys.stderr.write("total ne-terminals: " + str(total_ne_terminals) + "\n")
            sys.stderr.write("total ne-non-terminals: " + str(total_ne_non_terminals) + "\n")