예제 #1
0
def gold_singletons(base_directory):
    """Collect the NP annotations that overlap no gold annotation.

    Loads the NP annotations and the gold annotations for
    ``base_directory`` through ``reconcile``. An NP is kept when, for every
    gold annotation, neither one ``contains`` the other.

    Returns a list of the kept NPs in the order they were encountered,
    without duplicates.
    """
    response_nps = reconcile.getNPs_annots(base_directory)
    gold_annots = reconcile.parseGoldAnnots(base_directory)

    unmatched = []
    for candidate in response_nps:
        # any() stops at the first overlapping gold annotation.
        overlaps_gold = any(
            candidate.contains(gold) or gold.contains(candidate)
            for gold in gold_annots)
        if overlaps_gold or candidate in unmatched:
            continue
        unmatched.append(candidate)
    return unmatched
예제 #2
0
# File Name : null-hunter.py
# Purpose : Prints every NP annotation whose text is empty, for each document in a file list
# Creation Date : 12-22-2011
# Last Modified : Thu 22 Dec 2011 11:38:14 AM MST
# Created By : Nathan Gilbert
#
import sys

from pyconcile.document import Document
from pyconcile import reconcile

if __name__ == "__main__":
    # Script entry point: argv[1] names a file listing one document per line.
    if len(sys.argv) < 2:
        print("Usage: %s <filelist>" % (sys.argv[0]))
        sys.exit(1)

    # The context manager closes the file list even when processing one of
    # the documents raises, which the previous open()/close() pair did not.
    with open(sys.argv[1], 'r') as fList:
        for f in fList:
            # Lines starting with "#" are treated as comments in the list.
            if f.startswith("#"):
                continue
            f = f.strip()
            print("Working on document: %s" % f)

            nps = reconcile.getNPs_annots(f)

            # Report every NP annotation whose text is the empty string.
            for noun_phrase in nps:
                if noun_phrase.getText() == "":
                    print(noun_phrase)
예제 #3
0
                      default=-1)
    # Integer id of the anaphor annotation; defaults to -1 when not supplied.
    parser.add_option("-2",
                      "--anaphor",
                      help="The anaphor id",
                      action="store",
                      dest="anaphor",
                      type="int",
                      default=-1)

    (options, args) = parser.parse_args()
    # Print usage and exit when no arguments were given, or when both the
    # tree file and the feature file options are empty strings.
    # NOTE(review): the test uses `and`, so supplying only one of the two
    # passes, yet both options.featurefile and options.treefile are read
    # below -- confirm whether `or` was intended.
    if (len(sys.argv) < 2) or ((options.treefile == "") and \
            (options.featurefile == "")):
        parser.print_help()
        sys.exit(1)

    # Load the NP annotations and look up the two mentions by their ids.
    nps = reconcile.getNPs_annots(options.directory)
    antecedent = nps.getAnnotByID(options.antecedent)
    anaphor = nps.getAnnotByID(options.anaphor)

    #print antecedent.ppprint()
    #print anaphor.ppprint()

    # The features are indexed by the string "<antecedent id>,<anaphor id>".
    features = reconcile.getFeatures(options.directory, options.featurefile)
    key = "%d,%d" % (options.antecedent, options.anaphor)
    pair_features = features[key]

    #for k in sorted(pair_features.keys()):
    #    print "%s = %s" % (k, str(pair_features[k]))

    # Read in the tree.
    treeFile = open(options.treefile, 'r')
예제 #4
0
#!/usr/bin/python
# File Name : gold_np_overlap.py
# Purpose : Prints out the response nps that overlap the gold NPs
# Creation Date : 11-22-2011
# Last Modified : Tue 22 Nov 2011 04:16:14 PM MST
# Created By : Nathan Gilbert
#
import sys

from pyconcile import reconcile

if __name__ == "__main__":
    # Script entry point: argv[1] is the document directory.
    if len(sys.argv) < 2:
        print("Usage: %s <dir>" % (sys.argv[0]))
        sys.exit(1)

    doc_dir = sys.argv[1]

    # Response NPs first, then the gsNPs whose MATCHED attribute is compared
    # against each response NP's id.
    response_nps = reconcile.getNPs_annots(doc_dir)
    gs_nps = reconcile.getGSNPs(doc_dir)

    for r_np in response_nps:
        # "C" when some gsNP's MATCHED attribute equals this NP's id,
        # "I" otherwise; any() stops at the first match.
        is_matched = any(
            g_np.getATTR("MATCHED") == r_np.getID() for g_np in gs_nps)
        if is_matched:
            print("C :%s" % r_np.pprint())
        else:
            print("I :%s" % r_np.pprint())
예제 #5
0
    # argv[1] names a file listing one document per line.
    fileList = open(sys.argv[1], 'r')
    anaphoric_nominals = annotation_set.AnnotationSet("anaphoric_noms")
    existential_nominals = annotation_set.AnnotationSet("exist_noms")

    for f in fileList:
        f = f.strip()
        # Lines starting with "#" are skipped.
        if f.startswith("#"):
            continue

        print("Working on %s" % f)

        # Gold annotations and chains for this document.
        gold_annots = reconcile.parseGoldAnnots(f)
        gold_chains = reconcile.getGoldChains(f)

        # Response NPs and part-of-speech annotations, followed by the
        # Sundance property and gold/response matching passes.
        response_nps = reconcile.getNPs_annots(f)
        pos = reconcile.getPOS(f, True)
        reconcile.addSundanceProps(f, response_nps)
        utils.match_nps(gold_annots, response_nps)

        for g in gold_annots:
            # Only gold annotations whose GOLD_TYPE is "NOM" are processed.
            if g.getATTR("GOLD_TYPE") != "NOM":
                continue

            # Copy the properties of every response NP whose id equals this
            # gold annotation's MATCHED attribute onto the gold annotation.
            for r in response_nps:
                if g.getATTR("MATCHED") == r.getID():
                    g.addProps(r.getProps())

            # For subjects (by either attribute), store the result of
            # reconcile.getSubjVerb under the "S_VERB" property.
            if g.getATTR("GRAMMAR") == "SUBJECT" or g.getATTR(
                    "SUN_ROLE") == "SUBJ":
                g.setProp("S_VERB", reconcile.getSubjVerb(g, pos))
예제 #6
0
def _record_verb_stats(mention, pos, text_lower):
    """Record the verb tied to a subject or direct-object mention.

    A mention whose ROLE attribute is "SUBJ" has the lower-cased result of
    reconcile.getSubjVerb appended to subj_verbs[text_lower]; a "DOBJ"
    mention uses reconcile.getObjVerb and obj_verbs instead. Nothing is
    recorded when the lookup returns None.
    """
    role = mention.getATTR("ROLE")
    if role == "SUBJ":
        verb = reconcile.getSubjVerb(mention, pos)
        if verb is not None:
            subj_verbs[text_lower].append(verb.lower())
    elif role == "DOBJ":
        verb = reconcile.getObjVerb(mention, pos)
        if verb is not None:
            obj_verbs[text_lower].append(verb.lower())


def gold_annotations(f):
    """Process the file with gold annotations.

    Walks every gold chain of document ``f`` and updates the module-level
    statistics tables, keyed by the lower-cased TEXT_CLEAN of each nominal
    mention:

    * first mention of a chain (nominal and not GOLD_SINGLETON): counted in
      nominal_base_antecedent;
    * any later nominal mention: counted in virtual_pronouns and
      virtual_pronoun_heads, with its sentence distance
      (distance_from_antecedent), tile distance (focus_distance) and the
      type of the immediately preceding mention (nominals2type) recorded;
    * both cases: docs_appeared, gold_semantic_class,
      reconcile_semantic_class, sun_semantic_class, number_gold_antecedents
      and the subject/object verb tables are appended to.

    Finally total_counts and total_counts_heads are increased by this
    document's word counts.

    Args:
        f: the document identifier handed to Document and the reconcile
            loaders.
    """

    global virtual_pronouns, total_counts, virtual_pronoun_heads, \
    nominal_base_antecedent, distance_from_antecedent

    doc = Document(f)
    gold_chains = reconcile.getGoldChains(f)

    #adding in Sundance nes.
    nes = reconcile.getNEs(f, True)
    add_reconcile_semantic_class(gold_chains, nes)

    #adding in Reconcile pos too.
    pos = reconcile.getPOS(f, True)

    # NOTE(review): the returned NPs are not used below; the call is kept in
    # case the loader has side effects -- confirm and remove if it has none.
    reconcile.getNPs_annots(f)

    #getting sundance nps
    sundance_nps = reconcile.getSundanceNPs(f)
    add_sundance_nps(gold_chains, sundance_nps)

    original_text = defaultdict(list)  # for getting total doc counts later.
    nominal2chains = defaultdict(
        list)  # the chains that a given nominal appears.

    for chain in list(gold_chains.keys()):
        prev_annot = None
        # The index doubles as the number of mentions that precede this one
        # in the chain, i.e. its count of possible antecedents.
        for antecedents, mention in enumerate(gold_chains[chain]):

            if antecedents == 0:
                #the first mention in a chain is not listed as anaphoric.
                if mention.getATTR("is_nominal") and not \
                mention.getATTR("GOLD_SINGLETON"):
                    text = mention.getText()
                    text_lower = mention.getATTR("TEXT_CLEAN").lower()
                    docs_appeared[text_lower].append(f)

                    nominal_base_antecedent[text_lower] = \
                    nominal_base_antecedent.get(text_lower, 0) + 1

                    original_text[text_lower].append(text)

                    #take note that this chain contained this nominal
                    nominal2chains[text_lower].append(chain)

                    #gold, Reconcile and Sundance semantic classes
                    gold_semantic_class[text_lower].append(
                        mention.getATTR("GOLD_SEMANTIC"))
                    reconcile_semantic_class[text_lower].append(
                        mention.getATTR("NE_CLASS"))
                    sun_semantic_class[text_lower].append(
                        mention.getATTR("SUN_SEMANTIC"))

                    number_gold_antecedents[text_lower].append(antecedents)

                    #get verb stats
                    _record_verb_stats(mention, pos, text_lower)

            elif mention.getATTR("is_nominal"):
                text = mention.getText()
                text_lower = mention.getATTR("TEXT_CLEAN").lower()
                head_text = mention.getATTR("HEAD_TEXT")

                original_text[text_lower].append(text)
                virtual_pronouns[text_lower] = \
                virtual_pronouns.get(text_lower, 0) + 1

                head_key = head_text.lower()
                virtual_pronoun_heads[head_key] = \
                virtual_pronoun_heads.get(head_key, 0) + 1

                #the semantic class Reconcile puts this in.
                reconcile_semantic_class[text_lower].append(
                    mention.getATTR("NE_CLASS"))

                #register this doc as containing this np.
                docs_appeared[text_lower].append(f)

                #take note that this chain contained this nominal
                nominal2chains[text_lower].append(chain)

                #take note of the gold semantic class
                gold_semantic_class[text_lower].append(
                    mention.getATTR("GOLD_SEMANTIC"))

                #the number of possible correct antecedents for this anaphor
                number_gold_antecedents[text_lower].append(antecedents)

                #sundance's semantic class
                sun_semantic_class[text_lower].append(
                    mention.getATTR("SUN_SEMANTIC"))

                # Verb statistics. The lookup can come back as None (the
                # first-mention path already guarded against that); the
                # shared helper skips those instead of raising on .lower().
                _record_verb_stats(mention, pos, text_lower)

                #get the sentence distance from these two mentions.
                mention_sent = reconcile.getAnnotSentence(f, mention)
                prev_sent = reconcile.getAnnotSentence(f, prev_annot)
                if mention_sent > -1 and prev_sent > -1:
                    distance_from_antecedent[text_lower].append(
                        mention_sent - prev_sent)

                #get the TextTiling segment distance for the two mentions
                mention_seg = doc.getAnnotTile(mention)
                prev_seg = doc.getAnnotTile(prev_annot)
                if mention_seg > -1 and prev_seg > -1:
                    focus_distance[text_lower].append(mention_seg - prev_seg)

                #getting the distribution of closest antecedent types for a
                #given nominal
                if prev_annot.getATTR("is_nominal"):
                    antecedent_type = "nominal"
                elif prev_annot.getATTR("is_pronoun"):
                    antecedent_type = "pronoun"
                else:
                    antecedent_type = "proper"
                type_counts = nominals2type[text_lower]
                type_counts[antecedent_type] = \
                type_counts.get(antecedent_type, 0) + 1

            prev_annot = mention

    #update the total counts; each distinct surface form is counted once.
    for key, surface_forms in original_text.items():
        for text in set(surface_forms):
            total_counts[key] = total_counts.get(key,
                                                 0) + doc.getWordCounts(text)

    #the head counts
    for key in virtual_pronoun_heads.keys():
        total_counts_heads[key] = total_counts_heads.get(key, 0) + \
        doc.getWordCounts(key)