                      help="Use all heuristics.",
                      action="store_true",
                      dest="all",
                      default=False)
    (options, args) = parser.parse_args()

    if len(sys.argv) < 2:
        parser.print_help()
        sys.exit(1)

    if options.verbose:
        VERBOSE = True

    if options.filelist is not None:
        fList = open(options.filelist, 'r')
        total_start_time = time.time()
        for f in fList:
            if f.startswith("#"):
                continue
            f = f.strip()
            start_time = time.time()
            print("Processing document: %s" % f)
            d = Document(f)
            num_pairs = process_doc(d, options)
            end_time = time.time()
            print("process time: %0.3f seconds :: %d pairs added" %
                  ((end_time - start_time, num_pairs)))
        total_end_time = time.time()
        print("Total process time: %0.3f seconds" %
              ((total_end_time - total_start_time)))
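#The fragment above begins partway through the option definitions. A minimal,
#self-contained sketch of the optparse setup it implies is shown below; the
#short flags (-v, -f, -a) and the --verbose/--filelist help strings are
#assumptions, only the --all attributes appear in the original fragment.
from optparse import OptionParser

parser = OptionParser()
parser.add_option("-v", "--verbose",
                  help="Print progress information.",
                  action="store_true",
                  dest="verbose",
                  default=False)
parser.add_option("-f", "--filelist",
                  help="File listing one document per line; '#' lines are skipped.",
                  dest="filelist",
                  default=None)
parser.add_option("-a", "--all",
                  help="Use all heuristics.",
                  action="store_true",
                  dest="all",
                  default=False)
(options, args) = parser.parse_args()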
Example #2
    head2text = defaultdict(list)

    #sys.stdout.flush()
    #sys.stdout.write("\r")
    #prog = ProgressBar(len(files))
    i = 0
    for f in files:
        if f.startswith("#"): continue

        #prog.update_time(i)
        #sys.stdout.write("\r%s" % (str(prog)))
        #sys.stdout.flush()

        i += 1
        f = f.strip()
        doc = Document(f)
        gold_nps = reconcile.getNPs(f)
        gold_chains = reconcile.getGoldChains(f)
        doc.addGoldChains(gold_chains)

        for np in gold_nps:
            text = utils.textClean(np.getText().lower()).strip()
            if TRUE_PRONOUNS:
                if text in TRUE:
                    add_stats(text, np, doc, nouns, head2text)
            else:
                if specificity_utils.isNominal(np):
                    #head = getHead(text)
                    #if head.endswith("%"): continue #skip percents
                    #if head[-1].isdigit(): continue #skip numbers
                    #if utils.isConj(head): continue #just skip these guys too
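                    #NOTE: reconstructed line, not in the original snippet; this
                    #example is cut off here. The nominal branch is assumed to end
                    #with the same add_stats call as the pronoun branch above.
                    add_stats(text, np, doc, nouns, head2text)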
Example #3
    it = {}
    third_person_plural = {}

    sys.stdout.flush()
    sys.stdout.write("\r")
    prog = ProgressBar(len(files))
    i = 0
    for f in files:
        if f.startswith("#"): continue
        f = f.strip()
        prog.update_time(i)
        sys.stdout.write("\r%s" % (str(prog)))
        sys.stdout.flush()
        i += 1

        doc = Document(f)
        #NOTE: still assuming that gold mentions are being supplied via
        #Reconcile.
        gold_nps = reconcile.getNPs(f)
        gold_chains = reconcile.getGoldChains(f)
        doc.addGoldChains(gold_chains)

        for np in gold_nps:
            text = utils.textClean(np.getText().lower()).strip()
            if text in data.THIRD_PERSON:
                #then it is he, him, she
                add_stats(third_person, doc, np, text)
            elif (text in data.IT) and (text != "i"):
                #then we have 'it' or 'its'
                add_stats(it, doc, np, text)
            elif text in data.THIRD_PERSON_PLURAL:
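                #NOTE: reconstructed line, not in the original snippet; this
                #example is cut off here. The plural branch is assumed to mirror
                #the two above, feeding the third_person_plural dict initialised
                #at the top of this example.
                add_stats(third_person_plural, doc, np, text)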
Example #4
    #the master dictionary of text to nominals
    text2nominal = {}

    #something easy to pickle
    noun2antecedents = defaultdict(dict)

    fileList = open(options.filelist, 'r')
    #lines that start with # are ignored
    files = [x for x in fileList.readlines() if not x.startswith("#")]
    fileList.close()
    for f in files:
        f = f.strip()
        print("Working on document: %s" % f)

        #load in the document statistics
        d = Document(f)

        #the total number of sentences in this text file. 
        #?double check with nltk?
        total_sentences_doc = len(reconcile.getSentences(f))

        #process a document, get all the nominal stats that are requested. 
        gold_chains = reconcile.getGoldChains(f, True)
        d.addGoldChains(gold_chains)

        for gc in list(gold_chains.keys()):
            base_antecedent = True
            previous_semantic_tag = ""
            prev_sent = -1
            prev_tile = -1
            prev_type = ""
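            #NOTE: reconstruction sketch, not in the original snippet; this example
            #is cut off here. Judging from the gold_annotations function later on
            #this page, the chain loop presumably walks each mention and tracks the
            #sentence / TextTiling-segment distance to its antecedent, e.g.:
            for mention in gold_chains[gc]:
                cur_sent = reconcile.getAnnotSentence(f, mention)
                cur_tile = d.getAnnotTile(mention)
                if not base_antecedent and cur_sent > -1 and prev_sent > -1:
                    sent_distance = cur_sent - prev_sent  #where this is stored is not shown
                if not base_antecedent and cur_tile > -1 and prev_tile > -1:
                    tile_distance = cur_tile - prev_tile  #where this is stored is not shown
                base_antecedent = False
                prev_sent = cur_sent
                prev_tile = cur_tile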
Example #5
        files.extend([x for x in fileList.readlines() if not x.startswith("#")])

    sys.stdout.flush()
    sys.stdout.write("\r")
    prog = ProgressBar(len(files))
    i = 0
    nominals = {}
    for f in files:
        if f.startswith("#"): continue
        f = f.strip()
        prog.update_time(i)
        sys.stdout.write("\r%s" % (str(prog)))
        sys.stdout.flush()
        i += 1

        doc = Document(f)
        #NOTE: still assuming that gold mentions are being supplied via
        #Reconcile.
        gold_nps = reconcile.getNPs(f)
        gold_chains = reconcile.getGoldChains(f)
        doc.addGoldChains(gold_chains)

        for np in gold_nps:
            text = utils.textClean(np.getText().lower()).strip()
            if (text in data.ALL_PRONOUNS):
                continue

            #if specificity_utils.isProper(np):
            #    continue
            anaphor_np = gold_nps.getAnnotBySpan(np.getStart(), np.getEnd())
            if anaphor_np["PROPER_NAME"] != "true" and anaphor_np["PROPER_NOUN"] != "true":
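                #NOTE: reconstructed guess, not in the original snippet; this
                #example is cut off here. Mentions that survive the pronoun and
                #proper-name filters are common-noun nominals, so presumably they
                #are tallied in the nominals dict initialised before the loop, e.g.:
                nominals[text] = nominals.get(text, 0) + 1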
Example #6
def gold_annotations(f):
    """process the file with gold annotations"""

    global virtual_pronouns, total_counts, virtual_pronoun_heads, \
    nominal_base_antecedent, distance_from_antecedent

    doc = Document(f)
    gold_chains = reconcile.getGoldChains(f)

    #adding in Sundance NEs.
    nes = reconcile.getNEs(f, True)
    add_reconcile_semantic_class(gold_chains, nes)

    #adding in Reconcile POS tags too.
    pos = reconcile.getPOS(f, True)

    #getting the document's NPs
    reconcile_nps = reconcile.getNPs_annots(f)

    #getting Sundance NPs
    sundance_nps = reconcile.getSundanceNPs(f)
    add_sundance_nps(gold_chains, sundance_nps)

    original_text_heads = {}  # just getting the heads
    original_text = defaultdict(list)  # for getting total doc counts later.
    nominal2chains = defaultdict(list)  # the chains that a given nominal appears in.

    for chain in list(gold_chains.keys()):
        base_antecedent = True
        prev_annot = None
        antecedents = 0
        for mention in gold_chains[chain]:

            #if the first antecedent in a chain, do not list it as anaphoric.
            if base_antecedent:
                if mention.getATTR("is_nominal") and not \
                mention.getATTR("GOLD_SINGLETON"):
                    text = mention.getText()
                    text_lower = mention.getATTR("TEXT_CLEAN").lower()
                    docs_appeared[text_lower].append(f)

                    nominal_base_antecedent[text_lower] = \
                    nominal_base_antecedent.get(text_lower, 0) + 1

                    original_text[text_lower].append(text)

                    #take note that this chain contained this nominal
                    nominal2chains[text_lower].append(chain)

                    #take note of the gold semantic class
                    gold_semantic_class[text_lower].append(
                        mention.getATTR("GOLD_SEMANTIC"))

                    #reconcile's semantic class
                    reconcile_semantic_class[text_lower].append(
                        mention.getATTR("NE_CLASS"))

                    #sundance's semantic class
                    sun_semantic_class[text_lower].append(
                        mention.getATTR("SUN_SEMANTIC"))

                    number_gold_antecedents[text_lower].append(antecedents)

                    #get verb stats
                    if mention.getATTR("ROLE") == "SUBJ":
                        verb = reconcile.getSubjVerb(mention, pos)
                        if verb is not None:
                            subj_verbs[text_lower].append(verb.lower())
                    elif mention.getATTR("ROLE") == "DOBJ":
                        verb = reconcile.getObjVerb(mention, pos)
                        if verb is not None:
                            obj_verbs[text_lower].append(verb.lower())

                base_antecedent = False
                prev_annot = mention
                antecedents += 1
                continue

            if mention.getATTR("is_nominal"):
                text = mention.getText()
                text_lower = mention.getATTR("TEXT_CLEAN").lower()
                head_text = mention.getATTR("HEAD_TEXT")

                original_text[text_lower].append(text)
                virtual_pronouns[text_lower] = \
                virtual_pronouns.get(text_lower, 0) + 1

                virtual_pronoun_heads[head_text.lower()] = \
                virtual_pronoun_heads.get(head_text.lower(), 0) + 1

                #the semantic class Reconcile puts this in.
                reconcile_semantic_class[text_lower].append(
                    mention.getATTR("NE_CLASS"))

                #register this doc as containing this np.
                docs_appeared[text_lower].append(f)

                #take note that this chain contained this nominal
                nominal2chains[text_lower].append(chain)

                #take note of the gold semantic class
                gold_semantic_class[text_lower].append(
                    mention.getATTR("GOLD_SEMANTIC"))

                #the number of possible correct antecedents for this anaphor
                number_gold_antecedents[text_lower].append(antecedents)

                #sundance's semantic class
                sun_semantic_class[text_lower].append(
                    mention.getATTR("SUN_SEMANTIC"))

                # subject / direct object verb statistics (guarding against a
                # missing verb, as in the base-antecedent case above)
                if mention.getATTR("ROLE") == "SUBJ":
                    verb = reconcile.getSubjVerb(mention, pos)
                    if verb is not None:
                        subj_verbs[text_lower].append(verb.lower())
                elif mention.getATTR("ROLE") == "DOBJ":
                    verb = reconcile.getObjVerb(mention, pos)
                    if verb is not None:
                        obj_verbs[text_lower].append(verb.lower())

                #get the sentence distance between these two mentions.
                mention_sent = reconcile.getAnnotSentence(f, mention)
                prev_sent = reconcile.getAnnotSentence(f, prev_annot)

                if mention_sent > -1 and prev_sent > -1:
                    distance_from_antecedent[text_lower].append(mention_sent - \
                            prev_sent)

                #get the TextTiling segment distance for the two mentions
                mention_seg = doc.getAnnotTile(mention)
                prev_seg = doc.getAnnotTile(prev_annot)
                if mention_seg > -1 and prev_seg > -1:
                    focus_distance[text_lower].append(mention_seg - \
                            prev_seg)

                #getting the distribution of closest antecedent types for a
                #given nominal
                if prev_annot.getATTR("is_nominal"):
                    nominals2type[text_lower]["nominal"] = \
                    nominals2type[text_lower].get("nominal",0) + 1
                elif prev_annot.getATTR("is_pronoun"):
                    nominals2type[text_lower]["pronoun"] = \
                    nominals2type[text_lower].get("pronoun",0) + 1
                else:
                    nominals2type[text_lower]["proper"] = \
                    nominals2type[text_lower].get("proper",0) + 1
            prev_annot = mention
            antecedents += 1

    #for key in nominal2chains.keys():
    #    print "%d : %s (doc: %s)" % (len(list(set(nominal2chains[key]))), key,
    #            doc)

    #update the total counts.
    for key in list(original_text.keys()):
        for text in list(set(original_text[key])):
            total_counts[key] = total_counts.get(key, 0) + doc.getWordCounts(text)

    #the head counts
    for key in list(virtual_pronoun_heads.keys()):
        total_counts_heads[key] = total_counts_heads.get(key, 0) + \
        doc.getWordCounts(key)
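#A possible driver for gold_annotations(), assuming the same file-list
#convention as the other examples on this page (lines starting with '#' are
#comments); this wrapper is a sketch, not part of the original code.
def run_gold_annotations(filelist):
    with open(filelist, 'r') as fileList:
        files = [x.strip() for x in fileList if not x.startswith("#")]
    for f in files:
        print("Working on document: %s" % f)
        gold_annotations(f)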
Example #7
        "proper": {}
    }

    sys.stdout.flush()
    sys.stdout.write("\r")
    prog = ProgressBar(len(files))
    i = 0
    for f in files:
        if f.startswith("#"): continue
        f = f.strip()
        prog.update_time(i)
        sys.stdout.write("\r%s" % (str(prog)))
        sys.stdout.flush()
        i += 1

        doc = Document(f)
        #NOTE: still assuming that gold mentions are being supplied via
        #Reconcile.
        gold_nps = reconcile.getNPs(f)
        gold_chains = reconcile.getGoldChains(f)
        doc.addGoldChains(gold_chains)

        for np in gold_nps:
            text = utils.textClean(np.getText().lower()).strip()

            if text in data.THIRD_PERSON:
                #then it is he, him, she
                add_stats(noun_classes["third_person"], doc, np, text)
            elif (text in data.IT) and (text != "i"):
                #then we have 'it' or 'its'
                add_stats(noun_classes["it"], doc, np, text)
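            #NOTE: reconstructed guess, not in the original; this example is cut
            #off here. The "proper" bucket initialised in noun_classes above
            #suggests at least one further branch, assumed to look something like:
            elif specificity_utils.isProper(np):
                add_stats(noun_classes["proper"], doc, np, text)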