def add_stats(text, anaphor, doc, nouns, head2text):
    """Accumulate statistics for the nominal NP `text` acting as an anaphor.

    Updates `nouns` (text -> Nominal stats object) and `head2text`
    (head noun -> list of full NP texts) in place; returns None.

    NOTE(review): assumes head2text behaves like a defaultdict(list); a
    plain dict would raise KeyError on an unseen head -- confirm caller.
    """
    head = getHead(text)
    if not head:
        return  # guard: an empty head would crash head[-1] below
    if head.endswith("%"):
        return  # skip percents
    if head[-1].isdigit():
        return  # skip numbers
    if utils.isConj(head):
        return  # just skip these guys too
    # NOTE: for some reason the upstream filter doesn't catch these
    # reflexives; must be happening after the head noun is created.
    if head in ("himself", "themselves"):
        return

    anaphor_np = doc.nps.getAnnotBySpan(anaphor.getStart(), anaphor.getEnd())

    # update the head2text dict
    if text not in head2text[head]:
        head2text[head].append(text)

    # then look for thangs
    if text not in nouns:
        nouns[text] = Nominal(text)
    else:
        nouns[text].updateCount()
    # both branches recorded the doc in the original; hoisted here
    nouns[text].updateDocs(doc.getName())

    # grammatical role of the anaphor itself
    if anaphor_np["GRAMMAR"] == "SUBJECT":
        nouns[text].subj += 1
    elif anaphor_np["GRAMMAR"] == "OBJECT":
        nouns[text].dobj += 1

    antecedent = doc.closest_antecedent(anaphor)
    if antecedent is not None:
        # record stats about the closest antecedent
        sd = doc.sentence_distance(antecedent, anaphor)
        nouns[text].sentence_distance(sd)
        nouns[text].most_recent_antecedents.append(
            antecedent.getText().lower())
        antecedent_np = doc.nps.getAnnotBySpan(antecedent.getStart(),
                                               antecedent.getEnd())
        if antecedent_np["GRAMMAR"] == "SUBJECT":
            nouns[text].subj_ante += 1
        elif antecedent_np["GRAMMAR"] == "OBJECT":
            nouns[text].dobj_ante += 1
        if antecedent.getText().lower() == anaphor.getText().lower():
            nouns[text].string_matches += 1
        if specificity_utils.isProper(antecedent_np):
            nouns[text].prp_ante += 1
        elif specificity_utils.isNominal(antecedent_np):
            nouns[text].nom_ante += 1
        elif specificity_utils.isPronoun(antecedent_np):
            nouns[text].pro_ante += 1
    else:
        # this guy starts the chain
        nouns[text].starts_chain += 1
def add_stats(text, head, anaphor, doc, nouns, head2text):
    """Accumulate usage statistics for the NP `text` (head noun `head`)
    acting as an anaphor in `doc`.

    Updates `nouns` (text -> VirtualPronoun stats object) and `head2text`
    (head -> list of full NP texts) in place; returns None.

    NOTE(review): assumes head2text behaves like a defaultdict(list); a
    plain dict would raise KeyError on an unseen head -- confirm caller.
    """
    # catches a problem with the head finder returning the determiner
    if head == 'the':
        head = text.split()[-1]
    if not head:
        return  # guard: an empty head would crash head[-1] below
    if head.endswith("%"):
        return  # skip percents
    if head[-1].isdigit():
        return  # skip numbers
    if utils.isConj(head):
        return  # just skip these guys too
    # Reflexives and a few junk heads the upstream filter lets through.
    if head in ("himself", "themselves", "head", "where", "there", "here"):
        return

    anaphor_np = doc.nps.getAnnotBySpan(anaphor.getStart(), anaphor.getEnd())

    # update the head2text dict
    if text not in head2text[head]:
        head2text[head].append(text)

    if text not in nouns:
        nouns[text] = VirtualPronoun(text)
    else:
        nouns[text].updateCount()
    # both branches recorded the doc in the original; hoisted here
    nouns[text].updateDocs(doc.getName())

    # grammatical role of the anaphor itself
    if anaphor_np["GRAMMAR"] == "SUBJECT":
        nouns[text].subj += 1
    elif anaphor_np["GRAMMAR"] == "OBJECT":
        nouns[text].dobj += 1

    # Bare-definite check ("the <head>"). The large block of modifier /
    # attachment bookkeeping that used to follow was entirely commented
    # out (dead code) and has been removed.
    if text.startswith("the {0}".format(head)):
        nouns[text].bare_definite += 1

    # Find which gold chain the anaphor belongs to. Stop at the first hit:
    # the original's inner `break` left the outer loop running over every
    # remaining chain even after a match.
    anaphor_chain = None
    for chain in doc.gold_chains:
        if any(anaphor == mention for mention in doc.gold_chains[chain]):
            anaphor_chain = chain
            break

    # NOTE(review): when no chain is found this records "<doc>:None";
    # behavior preserved from the original -- confirm it is intended.
    chain_name = "{0}:{1}".format(doc.getName(), anaphor_chain)
    if chain_name not in nouns[text].chains:
        nouns[text].chains.append(chain_name)

    if anaphor_chain is not None:
        chain_length = len(doc.gold_chains[anaphor_chain])
        nouns[text].chain_size[doc.getName()] = chain_length

        # coverage: number of sentences touched / number of sentences
        covered_sentences = 0
        for sent in doc.sentences:
            for mention in doc.gold_chains[anaphor_chain]:
                if sent.contains(mention):
                    covered_sentences += 1
                    break
        chain_coverage = float(covered_sentences) / len(doc.sentences)
        nouns[text].chain_coverage[doc.getName()] = chain_coverage

        # for/else: counts only if no other chain is strictly longer
        for chain in doc.gold_chains:
            if chain == anaphor_chain:
                continue
            if len(doc.gold_chains[chain]) > chain_length:
                break
        else:
            nouns[text].largest_chain += 1

        # does this chain consist solely of common nouns?
        common_only = True
        for mention in doc.gold_chains[anaphor_chain]:
            if mention == anaphor:
                continue
            mention_head = getHead(utils.textClean(mention.getText()))
            if mention_head not in nouns[text].all_entities:
                nouns[text].all_entities.append(mention_head)
            # does this chain contain proper names?
            mention_np = doc.nps.getAnnotBySpan(mention.getStart(),
                                                mention.getEnd())
            if specificity_utils.isProper(mention_np):
                common_only = False
        if chain_name not in nouns[text].nom_chain_only:
            nouns[text].nom_chain_only[chain_name] = common_only
    else:
        sys.stderr.write("Anaphor chain not found?\n")

    antecedent = doc.closest_antecedent(anaphor)
    if antecedent is not None:
        # record stats about the closest antecedent
        sd = doc.sentence_distance(antecedent, anaphor)
        nouns[text].sentence_distance(sd)
        nouns[text].most_recent_antecedents.append(
            antecedent.getText().lower())
        antecedent_np = doc.nps.getAnnotBySpan(antecedent.getStart(),
                                               antecedent.getEnd())
        if antecedent_np["GRAMMAR"] == "SUBJECT":
            nouns[text].subj_ante += 1
        elif antecedent_np["GRAMMAR"] == "OBJECT":
            nouns[text].dobj_ante += 1
        if antecedent.getText().lower() == anaphor.getText().lower():
            nouns[text].string_matches += 1
        if specificity_utils.isProper(antecedent_np):
            nouns[text].prp_ante += 1
        elif specificity_utils.isNominal(antecedent_np):
            nouns[text].nom_ante += 1
        elif specificity_utils.isPronoun(antecedent_np):
            nouns[text].pro_ante += 1
    else:
        # this guy starts the chain
        nouns[text].starts_chain += 1
def add_stats(pronoun_class, doc, anaphor, text):
    """Record statistics about the pronoun `text` used as an anaphor in `doc`.

    Updates `pronoun_class` (text -> Pronoun stats object) in place;
    returns None. (The exploratory PDTB discourse-distance code that was
    commented out here has been removed.)
    """
    if text in pronoun_class:
        pronoun_class[text].updateCount()
    else:
        pronoun_class[text] = Pronoun(text)
    stats = pronoun_class[text]

    # find the closest antecedent
    antecedent = doc.closest_antecedent(anaphor)
    if antecedent is None:
        # no antecedent: this pronoun starts its chain
        stats.starts_chain()
        return

    # uniqueness -- the rate at which a pronoun is coreferent with "new"
    # words. Cleaned/lowered text computed once (the original did it twice).
    ant_text = utils.textClean(antecedent.getText()).lower()
    stats.addAntecedent(ant_text)

    # string matches -- how often does this pronoun resolve to instances
    # of itself? (he/him and they/them treated as the same pronoun)
    if ant_text == text \
       or (ant_text in ("he", "him") and text in ("he", "him")) \
       or (ant_text in ("they", "them") and text in ("they", "them")):
        stats.string_matches += 1

    # distance of the closest antecedent: 1. in words, 2. in sentences
    wd = doc.word_distance(antecedent, anaphor)
    stats.word_distance(antecedent, wd)
    sd = doc.sentence_distance(antecedent, anaphor)
    stats.sent_distance(antecedent, sd)

    # how often is this pronoun coreferent with a nominal?
    # NOTE(review): passes the raw antecedent annotation here while the
    # checks below use the NP annotation -- confirm this is intended.
    if specificity_utils.isNominal(antecedent):
        stats.nominal_antecedent += 1

    # how often is this pronoun coreferent with a proper name?
    antecedent_np = doc.nps.getAnnotBySpan(antecedent.getStart(),
                                           antecedent.getEnd())
    if antecedent_np.getATTR("contains_pn") is not None:
        if antecedent_np.getATTR("contains_pn") == antecedent.getText():
            stats.proper_antecedent += 1
    elif specificity_utils.isProper(antecedent_np):
        stats.proper_antecedent += 1

    # how often are antecedents of this pronoun in the subj or dobj
    # position of a verb?
    if antecedent_np["GRAMMAR"] == "SUBJECT":
        stats.subj += 1
    elif antecedent_np["GRAMMAR"] == "OBJECT":
        stats.dobj += 1
# Creation Date : 08-27-2013
# Last Modified : Wed 28 Aug 2013 03:08:42 PM MDT
# Created By : Nathan Gilbert
#
import sys

import en
from pyconcile import reconcile
from pyconcile import utils

import specificity_utils

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: %s <dir>" % (sys.argv[0]))
        sys.exit(1)

    # read in the nps for the supplied document directory
    all_nps = reconcile.getNPs(sys.argv[1])
    for candidate in all_nps:
        # only common-noun NPs are of interest here
        if specificity_utils.isProper(candidate) or specificity_utils.isPronoun(candidate):
            continue
        head = specificity_utils.getHead(utils.textClean(candidate.getText()))
        print("{0} => {1}".format(candidate.pprint(), head))
        print(en.noun.senses(head))
        print(en.noun.hypernyms(head, sense=0))
        print("=" * 30)
    # read in named entities and/or read the
    # fire up wordnet
add_stats(noun_classes["it"], doc, np, text) elif text in data.THIRD_PERSON_PLURAL: #we have 'they' or 'them' add_stats(noun_classes["third_person_plural"], doc, np, text) elif text in data.THIRD_SINGULAR_POSSESSIVES: add_stats(noun_classes["third_single_possessive"], doc, np, text) elif text in data.THIRD_PLURAL_POSSESSIVES: add_stats(noun_classes["third_plural_possessive"], doc, np, text) elif text in data.IT_POSSESSIVE: add_stats(noun_classes["it_possessive"], doc, np, text) elif specificity_utils.isNominal(np): add_stats(noun_classes["nominal"], doc, np, text) #sys.stderr.write("{0}\n".format(text)) elif specificity_utils.isProper(np): add_stats(noun_classes["proper"], doc, np, text) #sys.stderr.write("{0}\n".format(text)) else: #sys.stderr.write("Word not found: {0}\n".format(text)) continue #true singletons -- TODO double check that these numbers are correct #this word exists outside of annotations #This needs to take place at the document level and cycle over all the for cls in list(noun_classes.keys()): for word in noun_classes[cls]: noun_classes[cls][word].singletons += doc.getSingletonCount( word) sys.stdout.write("\r \r\n")