Example #1
import sys

import utils  #project-local helpers: isConj, conjHead, break_word


def getHead(text):
    """Duplicates the head-noun extraction done on the Java side."""

    text = text.strip()

    #check if conjunction
    if utils.isConj(text):
        return utils.conjHead(text)

    tokens = text.split()
    new_text = ""
    first = True
    for word in tokens:
        if utils.break_word(word) and not first:
            break

        if word.endswith(","):
            new_text += word[:-1]
            break

        #capture possessives?
        #if word.endswith("'s"):
        #    new_text = ""
        #    continue

        new_text += word + " "
        first = False

    new_text = new_text.strip()
    if new_text == "":
        sys.stderr.write("Empty text: \"{0}\" : \"{1}\"\n".format(
            text, new_text))
        return ""  #avoid an IndexError on the split below

    return new_text.split()[-1]
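
#Illustrative usage, not part of the original source. The stub below stands
#in for the project's utils module (its real isConj/conjHead/break_word
#logic is assumed, not known), so getHead can be exercised standalone:
if __name__ == "__main__":
    import types
    utils = types.SimpleNamespace(
        isConj=lambda t: " and " in t,             #assumed behavior
        conjHead=lambda t: t.split(" and ")[0],    #assumed behavior
        break_word=lambda w: w in ("of", "in", "that", "which"),  #assumed
    )
    print(getHead("the chairman of the board"))    #-> "chairman"
    print(getHead("a big, red balloon"))           #-> "big"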
Example #2
def add_stats(text, anaphor, doc, nouns, head2text):
    """Accumulate per-noun statistics for a nominal anaphor."""
    head = getHead(text)
    if head.endswith("%"): return  #skip percents
    if head[-1].isdigit(): return  #skip numbers
    if utils.isConj(head): return  #just skip these too
    #NOTE: the upstream filter misses these reflexives for some reason; it
    #must run after the head noun is created, so catch them here.
    if head == "himself": return
    if head == "themselves": return

    anaphor_np = doc.nps.getAnnotBySpan(anaphor.getStart(), anaphor.getEnd())

    #update the head2text dict
    if text not in head2text[head]:
        head2text[head].append(text)
    #make sure the head nouns are reasonable
    #print "{0} => {1}".format(text, head)

    #then record stats for this noun phrase
    if text not in nouns:
        nouns[text] = Nominal(text)
    else:
        nouns[text].updateCount()
    nouns[text].updateDocs(doc.getName())

    if anaphor_np["GRAMMAR"] == "SUBJECT":
        nouns[text].subj += 1
    elif anaphor_np["GRAMMAR"] == "OBJECT":
        nouns[text].dobj += 1

    antecedent = doc.closest_antecedent(anaphor)
    if antecedent is not None:
        #record stats
        sd = doc.sentence_distance(antecedent, anaphor)
        nouns[text].sentence_distance(sd)
        nouns[text].most_recent_antecedents.append(
            antecedent.getText().lower())

        antecedent_np = doc.nps.getAnnotBySpan(antecedent.getStart(),
                                               antecedent.getEnd())
        if antecedent_np["GRAMMAR"] == "SUBJECT":
            nouns[text].subj_ante += 1
        elif antecedent_np["GRAMMAR"] == "OBJECT":
            nouns[text].dobj_ante += 1

        if antecedent.getText().lower() == anaphor.getText().lower():
            nouns[text].string_matches += 1

        if specificity_utils.isProper(antecedent_np):
            nouns[text].prp_ante += 1
        elif specificity_utils.isNominal(antecedent_np):
            nouns[text].nom_ante += 1
        elif specificity_utils.isPronoun(antecedent_np):
            nouns[text].pro_ante += 1

    else:
        #this guy starts the chain
        nouns[text].starts_chain += 1
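
#The Nominal record used above is defined elsewhere in the project. Below is
#a hypothetical sketch of the interface this snippet relies on (attribute
#names taken from the usage above, initial values assumed):
class Nominal(object):
    def __init__(self, text):
        self.text = text
        self.count = 1
        self.docs = []
        self.subj = self.dobj = 0
        self.subj_ante = self.dobj_ante = 0
        self.prp_ante = self.nom_ante = self.pro_ante = 0
        self.string_matches = 0
        self.starts_chain = 0
        self.most_recent_antecedents = []
        self._sentence_distances = []

    def updateCount(self):
        self.count += 1

    def updateDocs(self, doc_name):
        if doc_name not in self.docs:
            self.docs.append(doc_name)

    def sentence_distance(self, sd):
        self._sentence_distances.append(sd)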
Example #3
def add_stats(text, head, anaphor, doc, nouns, head2text):
    """Accumulate per-noun statistics for a nominal anaphor (extended)."""

    #work around a bad head ("the") produced for some NPs
    if head == 'the':
        head = text.split()[-1]

    if head.endswith("%"): return  #skip percents
    if head[-1].isdigit(): return  #skip numbers
    if utils.isConj(head): return  #just skip these too
    #NOTE: the upstream filter misses these reflexives for some reason; it
    #must run after the head noun is created, so catch them here.
    if head == "himself": return
    if head == "themselves": return
    if head == "head": return
    if head == "where": return
    if head == "there": return
    if head == "here": return

    anaphor_np = doc.nps.getAnnotBySpan(anaphor.getStart(), anaphor.getEnd())

    #update the head2text dict
    if text not in head2text[head]:
        head2text[head].append(text)
    #make sure the head nouns are reasonable
    #print "{0} => {1}".format(text, head)

    #then record stats for this noun phrase
    if text not in nouns:
        nouns[text] = VirtualPronoun(text)
    else:
        nouns[text].updateCount()
    nouns[text].updateDocs(doc.getName())

    if anaphor_np["GRAMMAR"] == "SUBJECT":
        nouns[text].subj += 1
    elif anaphor_np["GRAMMAR"] == "OBJECT":
        nouns[text].dobj += 1

    #begin modifier code
    definite = "the {0}".format(head)
    indefinite1 = "a {0}".format(head)
    indefinite2 = "an {0}".format(head)

    #pos = reconcile.getPOS(doc.getName())
    #head_index = specificity_utils.getHeadIndex(anaphor_np, head)
    #np_pos = pos.getSubset(anaphor.getStart(), anaphor.getEnd())
    #np_words = text.split()
    if text.startswith(definite):
        nouns[text].bare_definite += 1
    #elif text.startswith(indefinite1) or text.startswith(indefinite2):
    #    nouns[text].indefinite += 1
    #else:
    #    #NOTE: just checking to see if there is some kind of modification now
    #    if len(np_pos) == len(np_words):
    #        #sys.stderr.write("Mismatch tag and word length: {0} => {1}\n".format(np_pos.getList(), np_words))
    #        for i in range(0, head_index):
    #            if np_pos[i]["TAG"] == "DT":
    #                continue
    #            elif np_pos[i]["TAG"] == "JJ":
    #                #print "Adjective: {0}".format(np_words[i])
    #                nouns[text].adjective_modifiers.append(np_words[i])
    #            elif np_pos[i]["TAG"].startswith("N"):
    #                #print "Noun: {0} {1}".format(np_words[i], np_pos[i]["TAG"])
    #                if np_pos[i]["TAG"].startswith("NNP"):
    #                    nouns[text].proper_modifiers.append(np_words[i])
    #                else:
    #                    nouns[text].common_modifiers.append(np_words[i])
    #            else:
    #                #print "?: {0}".format(np_words[i])
    #                nouns[text].other_modifiers.append(np_words[i])

    #if text.startswith("the "):
    #get parts of speech for the np:
    #else:
    ##not definite, but still modified
    #if len(np_pos) == len(np_words):
    ##sys.stderr.write("Mismatch tag and word length: {0} => {1}\n".format(np_pos.getList(), np_words))
    #continue

    #for i in range(0, head_index):
    #if np_pos[i]["TAG"] == "DT":
    #continue
    #elif np_pos[i]["TAG"] == "JJ":
    ##print "Adjective: {0}".format(np_words[i])
    #nouns[text].adjective_modifiers.append(np_words[i])
    #elif np_pos[i]["TAG"].startswith("N"):
    ##print "Noun: {0} {1}".format(np_words[i], np_pos[i]["TAG"])
    #if np_pos[i]["TAG"].startswith("NNP"):
    #nouns[text].proper_modifiers.append(np_words[i])
    #else:
    #nouns[text].common_modifiers.append(np_words[i])
    #else:
    ##print "?: {0}".format(np_words[i])
    #nouns[text].other_modifiers.append(np_words[i])

    #capture post modifiers
    #if text.find(head + " of ") > -1:
    #    of_start = text.find(head + " of ")
    #    of_object = text[len(head) + of_start + 3:]
    #    nouns[text].of_attachments.append(of_object.strip())

    #if text.find(head + " on ") > -1:
    #    of_start = text.find(head + " on ")
    #    of_object = text[len(head) + of_start + 3:]
    #    nouns[text].on_attachments.append(of_object.strip())

    #if text.find(head + " that ") > -1:
    #    that_start = text.find(head + " that ")
    #    that_clause = text[len(head) + that_start + 5:]
    #    nouns[text].that_attachments.append(that_clause.strip())

    #if text.find(head + " with ") > -1:
    #    that_start = text.find(head + " with ")
    #    that_clause = text[len(head) + that_start + 5:]
    #    nouns[text].with_attachments.append(that_clause.strip())

    #if text.find(head + " by ") > -1:
    #    by_start = text.find(head + " by ")
    #    by_object = text[len(head) + by_start + 3:]
    #    nouns[text].by_attachments.append(by_object.strip())

    #if text.find(head + " which ") > -1:
    #    which_start = text.find(head + " which ")
    #    which_clause = text[len(head) + which_start + 6:]
    #    nouns[text].which_attachments.append(which_clause.strip())

    #if len(np_pos) >= head_index + 2 and len(np_words) >= head_index + 2:
    #    if np_pos[head_index + 1]["TAG"] == "VBD":
    #        nouns[text].verbed.append(np_words[head_index + 1])

    #    if np_pos[head_index + 1]["TAG"] == "VBG":
    #        nouns[text].verbing.append(np_words[head_index + 1])
    #end modifier code

    #find which chain the anaphor is from and add the chain statistics
    anaphor_chain = None
    for chain in doc.gold_chains:
        for mention in doc.gold_chains[chain]:
            if anaphor == mention:
                anaphor_chain = chain
                break
        if anaphor_chain is not None:
            break  #stop scanning once the chain is found

    if anaphor_chain is not None:
        chain_name = "{0}:{1}".format(doc.getName(), anaphor_chain)
        if chain_name not in nouns[text].chains:
            nouns[text].chains.append(chain_name)

        chain_length = len(doc.gold_chains[anaphor_chain])
        nouns[text].chain_size[doc.getName()] = chain_length

        #coverage
        #chain_start = doc.gold_chains[chain][0].getStart()
        #chain_end   = doc.gold_chains[chain][-1].getEnd()
        #chain_size  = chain_end - chain_start
        #chain_coverage = float(chain_size) / len(doc.text)

        # number of sentences touched / number of sentences
        covered_sentences = 0
        for sent in doc.sentences:
            for mention in doc.gold_chains[anaphor_chain]:
                if sent.contains(mention):
                    covered_sentences += 1
                    break

        chain_coverage = float(covered_sentences) / len(doc.sentences)
        nouns[text].chain_coverage[doc.getName()] = chain_coverage

        #is this the largest chain in the document? the for/else only falls
        #through to the else when no other chain is strictly longer
        for chain in doc.gold_chains:
            if chain == anaphor_chain:
                continue
            if len(doc.gold_chains[chain]) > chain_length:
                break
        else:
            nouns[text].largest_chain += 1

        common_only = True
        for mention in doc.gold_chains[anaphor_chain]:
            if mention == anaphor:
                continue
            mention_head = getHead(utils.textClean(mention.getText()))
            if mention_head not in nouns[text].all_entities:
                nouns[text].all_entities.append(mention_head)

            #does this chain contain proper names?
            mention_np = doc.nps.getAnnotBySpan(mention.getStart(),
                                                mention.getEnd())
            if specificity_utils.isProper(mention_np):
                common_only = False

        if chain_name not in nouns[text].nom_chain_only:
            nouns[text].nom_chain_only[chain_name] = common_only
    else:
        sys.stderr.write("Anaphor chain not found?\n")

    antecedent = doc.closest_antecedent(anaphor)
    if antecedent is not None:
        #record stats
        sd = doc.sentence_distance(antecedent, anaphor)
        nouns[text].sentence_distance(sd)
        nouns[text].most_recent_antecedents.append(
            antecedent.getText().lower())

        antecedent_np = doc.nps.getAnnotBySpan(antecedent.getStart(),
                                               antecedent.getEnd())
        if antecedent_np["GRAMMAR"] == "SUBJECT":
            nouns[text].subj_ante += 1
        elif antecedent_np["GRAMMAR"] == "OBJECT":
            nouns[text].dobj_ante += 1

        if antecedent.getText().lower() == anaphor.getText().lower():
            nouns[text].string_matches += 1

        if specificity_utils.isProper(antecedent_np):
            nouns[text].prp_ante += 1
        elif specificity_utils.isNominal(antecedent_np):
            nouns[text].nom_ante += 1
        elif specificity_utils.isPronoun(antecedent_np):
            nouns[text].pro_ante += 1
    else:
        #this guy starts the chain
        nouns[text].starts_chain += 1
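
#Both versions of add_stats index head2text[head] before assigning to it,
#so the caller presumably passes a collections.defaultdict(list). A
#hypothetical driver sketch follows; `docs` and nominal_anaphors() are
#assumptions for illustration, not project code:
from collections import defaultdict

def collect_noun_stats(docs):
    nouns = {}
    head2text = defaultdict(list)  #head noun -> list of full NP strings
    for doc in docs:
        for anaphor in nominal_anaphors(doc):  #assumed helper
            text = utils.textClean(anaphor.getText()).lower()
            add_stats(text, getHead(text), anaphor, doc, nouns, head2text)
    return nouns, head2text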
Example #4
    sys.stderr.flush()
    sys.stderr.write("\r")
    prog = ProgressBar(len(files))
    for j, f in enumerate(files):
        prog.update_time(j)
        sys.stderr.write("\r%s" % str(prog))
        sys.stderr.flush()

        nps = reconcile.getNPs(f)
        pos = reconcile.getPOS(f)
        for np in nps:
            if specificity_utils.isNominal(np):
                np_text = utils.textClean(np.getText()).lower()
                if utils.isConj(np_text): continue

                np_head = specificity_utils.getHead(np_text).lower()
                head_index = specificity_utils.getHeadIndex(np, np_head)
                np_pos = pos.getSubset(np.getStart(), np.getEnd())
                np_words = np_text.split()
                #print "{0:35} -> {1:15}".format(np_text, np_head)

                if np_head not in head2nouns:
                    head2nouns[np_head] = Noun(np_head)
                else:
                    head2nouns[np_head].count += 1
                head2nouns[np_head].docs.append(f)

                if np["GRAMMAR"] == "SUBJECT":