def processACE(f, np, heads2qp):
    """Update quasi-pronoun bookkeeping for one NP of an ACE document.

    Looks up the ACE gold annotation that spans `np`; if it is nominal,
    records its head in `heads2qp` (mapping head string -> QuasiPronoun)
    and tallies singleton vs. chain-starting occurrences.

    f         -- document path/name passed through to reconcile loaders
    np        -- the noun-phrase annotation being processed
    heads2qp  -- dict of head string -> QuasiPronoun, mutated in place
    """
    ace_annots = reconcile.parseGoldAnnots(f)
    stanford_deps = reconcile.getStanfordDep(f)
    gold_chains = reconcile.getGoldChains(f)
    # find the gold annotation with exactly this NP's byte span
    ace_np = ace_annots.getAnnotBySpan(np.getStart(), np.getEnd())
    if ace_np["is_nominal"]:
        head = utils.textClean(ace_np["HEAD"].strip().lower())
        text = utils.textClean(np.getText())
        #bookkeeping
        if head not in list(heads2qp.keys()):
            heads2qp[head] = QuasiPronoun(head)
        else:
            heads2qp[head].updateDocs(f)
            heads2qp[head].updateCount()
        if ace_np["GOLD_SINGLETON"]:
            # NOTE(review): spelled 'singelton' here but 'singleton' in the
            # sibling processACE(f, head2qp) — confirm against QuasiPronoun
            heads2qp[head].singelton += 1
        else:
            #does it start the chain?
            for gc in list(gold_chains.keys()):
                if gold_chains[gc][0] == np:
                    heads2qp[head].starts_chain += 1
                    break
        process(f, np, head, text, heads2qp, stanford_deps)
def isProper(annot):
    """Heuristically decide whether `annot` names a proper entity.

    Checks, in order: the cached PROPER_NOUNS word list, the annotation's
    PROPER_NAME/PROPER_NOUN attributes, honorific prefixes, and common
    corporate suffixes.  Returns True/False.
    """
    global PROPER_NOUNS
    if len(PROPER_NOUNS) < 1:
        # lazily load the proper-noun lexicon on first call
        read_in_propers()
    text = utils.textClean(annot.getText()).lower().strip()
    if text in PROPER_NOUNS:
        return True
    if annot["PROPER_NAME"] == "true" or annot["PROPER_NOUN"] == "true":
        return True
    if text.startswith(("mr.", "ms.", "mrs.")):
        return True
    # corporate/company suffixes count as proper names
    return text.endswith(("corp.", "co.", "ltd.", "inc.", "ag", "plc"))
def getHeadSpan(annot, head):
    """Locate `head` inside the annotation text and return its absolute span.

    Returns (start, end) offsets shifted by the annotation's own start
    offset, or None when the head cannot be found.

    FIX: the head string is now passed through re.escape before being
    embedded in the pattern; previously metacharacters in the head
    (e.g. '$', '+', '.') could raise re.error or silently mis-match.
    """
    head = head.replace("(", "").replace(")", "")
    match = re.compile(r'\b({0})\b'.format(re.escape(head)),
                       flags=re.IGNORECASE).search(
                           utils.textClean(annot.getText()))
    if match:
        # shift match offsets into document coordinates
        return (match.start(1) + annot.getStart(),
                match.end(1) + annot.getStart())
    return None
def isPronoun(annot):
    """Return True when the annotation's cleaned text is a pronoun or a
    pronoun-like deictic word ('here', 'there', 'then', 'those')."""
    cleaned = utils.textClean(annot.getText()).lower().strip()
    return cleaned in data.ALL_PRONOUNS or cleaned in ("here", "there",
                                                       "then", "those")
def getHeadIndex(annot, head):
    """Return the 0-based token index of `head` within the annotation text.

    Tokens are compared after lowercasing/cleaning the annotation text;
    `head` is compared as given (callers pass it already lowercased).
    Returns -1 when the head token is not present.

    IDIOM: replaced the manual counter loop with enumerate().
    """
    annot_text = utils.textClean(annot.getText()).lower().strip()
    for i, tok in enumerate(annot_text.split()):
        if tok == head:
            return i
    return -1
def getHeadSpan(annot, head):
    """Locate `head` inside the annotation text and return its relative span.

    Returns (start, end-1) offsets relative to the cleaned annotation text,
    or None when the text contains parentheses (known problem case) or the
    head cannot be found.

    FIX: the head string is now re.escape()d before pattern construction;
    previously regex metacharacters in the head could raise re.error or
    silently mis-match.
    """
    #NOTE: texts with parenths have problems
    if annot.getText().find("(") > -1 or annot.getText().find(")") > -1:
        return None
    match = re.compile(r'\b({0})\b'.format(re.escape(head)),
                       flags=re.IGNORECASE).search(
                           utils.textClean(annot.getText()))
    if match:
        # end is inclusive here (end-1), unlike the absolute-span variant
        return (match.start(1), match.end(1) - 1)
    return None
def addDefinite(self, orig):
    """Tally the determiner form of `orig`: definite ('the ...'),
    indefinite ('a ...'/'an ...'), or bare (text equals the head alone).

    Increments the matching bucket in self.definite; NPs that are neither
    determined nor bare are not counted.
    """
    cleaned = utils.textClean(orig.getText())
    if cleaned.startswith("the "):
        bucket = "definite"
    elif cleaned.startswith(("a ", "an ")):
        bucket = "indefinite"
    elif cleaned == self.head:
        bucket = "bare"
    else:
        bucket = None
    if bucket is not None:
        self.definite[bucket] = self.definite.get(bucket, 0) + 1
def processACE(f, head2qp):
    """Collect quasi-pronoun statistics over every NP of one ACE document.

    For each NP: in PRONOUNS mode only pronouns are kept, otherwise only
    nominals; the head string is tallied in `head2qp` (head -> QuasiPronoun),
    singletons and bare-indefinite singletons are counted, and non-singletons
    are handed to process_gold (when USE_GOLD) and process_syntax.

    f       -- document path/name for the reconcile loaders
    head2qp -- dict of head string -> QuasiPronoun, mutated in place
    """
    global USE_GOLD
    ace_annots = reconcile.parseGoldAnnots(f)
    nps = reconcile.getNPs(f)
    stanford_deps = reconcile.getStanfordDep(f)
    gold_chains = reconcile.getGoldChains(f)
    for np in nps:
        # gold annotation with exactly this NP's span
        ace_np = ace_annots.getAnnotBySpan(np.getStart(), np.getEnd())
        head = None
        text = None
        if PRONOUNS:
            if qp_utils.isPronoun(np):
                head = ace_np["HEAD"].lower()
                text = np.getText()
            else:
                continue
        else:
            if ace_np["is_nominal"]:
                head = utils.textClean(ace_np["HEAD"].strip().lower())
                text = utils.textClean(np.getText())
            else:
                continue
        #bookkeeping
        if head not in list(head2qp.keys()):
            head2qp[head] = QuasiPronoun(head)
        else:
            head2qp[head].updateDocs(f)
            head2qp[head].updateCount()
        if ace_np["GOLD_SINGLETON"]:
            head2qp[head].singleton += 1
            # bare indefinite singletons are tracked separately
            if (text.startswith("a ") or text.startswith("an ")):
                head2qp[head].faux_ba += 1
        else:
            #does it start the chain?
            if USE_GOLD:
                process_gold(f, np, head, text, head2qp, gold_chains)
        # NOTE(review): placement reconstructed to match the sibling
        # process() routine — syntax stats gathered for every kept NP
        process_syntax(f, np, head, text, head2qp, stanford_deps)
def collectACEFPs(ace_annots, this_files_common_nouns):
    """Append ACE gold NPs that behave like 'faux pronouns'.

    Keeps non-singleton nominal gold NPs whose full text is just a
    determiner/demonstrative plus the head ('the X', 'that X', 'this X',
    'these X', 'those X').

    ace_annots              -- iterable of gold annotations (may contain None)
    this_files_common_nouns -- output list, mutated in place

    FIX: the failure branch previously formatted an undefined name `np`
    (NameError); it now reports on the missing gold annotation itself.
    """
    for gold_np in ace_annots:
        if gold_np is not None:
            #if not gold_np["GOLD_SINGLETON"] and np["GRAMMAR"] == "SUBJECT":
            if not gold_np["GOLD_SINGLETON"] and gold_np["is_nominal"]:
                gold_text = utils.textClean(gold_np.getText()).lower().strip()
                gold_head = gold_np["HEAD"].lower().strip()
                #definites + demonstratives
                if gold_text in ("the " + gold_head, "that " + gold_head,
                                 "this " + gold_head, "these " + gold_head,
                                 "those " + gold_head):
                    this_files_common_nouns.append(gold_np)
        else:
            print("couldn't find {0} in the gold".format(gold_np))
def isProper(annot, pos):
    """Heuristically decide whether `annot` is a proper name, using word
    lists, annotation attributes, and finally the POS tag of the NP head.

    annot -- the noun-phrase annotation
    pos   -- POS annotation set covering the document

    FIX: both getHeadSpan attempts can return None (e.g. head absent from
    the cleaned text); the original then crashed with TypeError on
    `head_span[0]`.  We now return False in that case.
    """
    global PROPER_NOUNS
    if len(PROPER_NOUNS) < 1:
        read_in_propers()
    global COMMON_NOUNS
    if len(COMMON_NOUNS) < 1:
        read_in_commons()
    text = utils.textClean(annot.getText()).lower()
    # word-list shortcuts first
    if text in PROPER_NOUNS:
        return True
    if text in COMMON_NOUNS:
        return False
    if isPronoun(annot):
        return False
    if annot["DATE"] != "NONE":
        return False
    # URLs are never proper names
    if text.find("http://") > -1 or text.find("www.") > -1:
        return False
    if annot["PROPER_NAME"] == "true" or annot["PROPER_NOUN"] == "true":
        return True
    # fall back to the POS tag of the head noun
    tags = pos.getSubset(annot.getStart(), annot.getEnd())
    head = getHead2(text, tags)
    if isNumber(head):
        return False
    if head.endswith("%") or head.startswith("$"):
        return False
    if head in ("million", "billion", "cents", "dollars"):
        return False
    head = head.replace("\"", "")
    head_span = getHeadSpan(annot, head)
    if head_span is None:
        head_span = getHeadSpan(annot, head.replace("]", ""))
    if head_span is None:
        # head not locatable in the text: cannot confirm proper-hood
        return False
    pos_tag = pos.getAnnotBySpan(head_span[0], head_span[1])
    if pos_tag is None:
        return False
    if pos_tag["TAG"].startswith("NNP"):
        return True
    return False
def isNominal(annot):
    """
    return True if the annotation is a nominal
    false otherwise
    """
    global COMMON_NOUNS
    if len(COMMON_NOUNS) < 1:
        # lazily load the common-noun lexicon on first call
        read_in_commons()
    cleaned = utils.textClean(annot.getText()).lower()
    if cleaned in COMMON_NOUNS:
        return True
    if cleaned.endswith("%"):
        return False
    # nominal = not proper, not a pronoun, not a date
    return (not isProper(annot)
            and cleaned not in data.ALL_PRONOUNS
            and annot["DATE"] == "NONE")
total_scores = {"vps_guessed": 0, "vps_correct": 0} RESPONSE_TYPE = "" for f in files: f = f.strip() print("Working on file: {0}".format(f)) gold_chains = reconcile.getGoldChains(f) pos = reconcile.getPOS(f) pairs = [] if "-hobbs" in sys.argv: RESPONSE_TYPE = "Hobbs" #read in the hobbs annotations hobbs_pairs = reconcile.getProResPairs(f, "hobbs") for pair in hobbs_pairs: tags = pos.getSubset(pair[1].getStart(), pair[1].getEnd()) text = utils.textClean(pair[1].getText()).lower() ana_head = qp_utils.getHead2(text, tags) print("{0:40} => {1}".format(text, ana_head)) if ana_head in QPs: pairs.append(pair) #if pair[1].getText().lower() not in data.ALL_PRONOUNS: # pairs.append(pair) elif "-rec" in sys.argv: #TODO if we choose this route then there needs to be some mods #since each vp can be resolved multiple times. # 1. only count the closest antecedent? # 2. don't count string matches? # 3. look at what is in the "pro_antes" property (that gives us the # Cogniac decision.
with open(sys.argv[1], 'r') as fileList: files.extend( [x for x in fileList.readlines() if not x.startswith("#")]) for f in files: f = f.strip() print("Working on file: {0}".format(f)) allNPs = {} nps = reconcile.getNPs(f) pos = reconcile.getPOS(f) stanford_nes = reconcile.getNEs(f) sundance_nes = reconcile.getSundanceNEs(f) for np in nps: key = "{0},{1}".format(np.getStart(), np.getEnd()) text = utils.textClean(np.getText().replace("\n", " ")).lower() tokens = text.split() if len(tokens) == 1: continue if key not in list(allNPs.keys()): allNPs[key] = NP(np.getText().replace("\n", " ")) allNPs[key].start = np.getStart() allNPs[key].end = np.getEnd() head = getHead(text) head_span = getHeadSpan(np, head) if head_span is not None: allNPs[key].addHead(head_span) #check to see if the head is contain in a proper name
#set up for all commons #ACE_HEADS.append(line) for f in files: f = f.strip() print("Working on file: {0}".format(f)) this_files_common_nouns = [] if ACE: tokens = reconcile.getTokens(f) pos = reconcile.getPOS(f) ace_annots = reconcile.parseGoldAnnots(f) this_files_common_nouns_orig = [] collectACEFPs(ace_annots, this_files_common_nouns_orig) #remove post modded commons for fp in this_files_common_nouns_orig: if not checkForModification(fp, tokens, pos): this_files_common_nouns.append(fp) else: gold_nps = reconcile.getNPs(f) collectFPs(gold_nps, this_files_common_nouns) #output common nouns to file i = 0 with open(f + "/annotations/faux_pronouns", 'w') as outFile: for annot in this_files_common_nouns: outFile.write("{0}\t{1},{2}\t{3}\t\n".format( i, annot.getStart(), annot.getEnd(), utils.textClean(annot.getText().lower()))) i += 1
def add_stats(pronoun_class, doc, anaphor, text): if text in list(pronoun_class.keys()): pronoun_class[text].updateCount() else: pronoun_class[text] = Pronoun(text) #find the closest antecedent antecedent = doc.closest_antecedent(anaphor) if antecedent is not None: #print anaphor.ppprint(), #print antecedent.ppprint() #uniqueness -- what is the rate at which a pronoun is coreferent #with "new" words? I once called this generality -- captured with #antecedent pronoun_class[text].addAntecedent( utils.textClean(antecedent.getText()).lower()) #string matches -- how often does this pronoun resolve to #instances of itself? ant_text = utils.textClean(antecedent.getText()).lower() if ant_text == text \ or (ant_text in ("he", "him") and text in ("he", "him")) \ or (ant_text in ("they", "them") and text in ("they", "them")): pronoun_class[text].string_matches += 1 #find the distance of the closest antecedent # 1. in word wd = doc.word_distance(antecedent, anaphor) pronoun_class[text].word_distance(antecedent, wd) # 2. in sentences sd = doc.sentence_distance(antecedent, anaphor) pronoun_class[text].sent_distance(antecedent, sd) #ant_pdtb = doc.getContainedPDTB(antecedent) #ana_pdtb = doc.getContainedPDTB(anaphor) # 3. pdtb parse distance ? what discourse parse values are useful? #for pdtb1 in ant_pdtb: # for pdtb2 in ana_pdtb: # if pdtb1 == pdtb2: # # a. if the anaphor and antecedent are in the same argument of a # # discourse relation? # pronoun_class[text].pdtb["SAME_ARG"] = pronoun_class[text].pdtb["SAME_ARG"] + 1 # # if (pdtb1.getATTR("TYPE") == pdtb2.getATTR("TYPE")) and (pdtb1.getATTR("SID") == pdtb2.getATTR("SID")): # # b. if the anaphor and antecedent are in different arguments of the # # same discourse relation # pronoun_class[text].pdtb["DIFF_ARG"] = pronoun_class[text].pdtb["DIFF_ARG"] + 1 #else: ## c. 
if the anaphor and antecedent are not in the same discourse ## relation at all # pronoun_class[text].pdtb["NONE"] = pronoun_class[text].pdtb["NONE"] + 1 #how often is this pronoun coreferent with a nominal? if specificity_utils.isNominal(antecedent): pronoun_class[text].nominal_antecedent += 1 #how often is this pronoun coreferent with a proper name? antecedent_np = doc.nps.getAnnotBySpan(antecedent.getStart(), antecedent.getEnd()) if antecedent_np.getATTR("contains_pn") is not None: if antecedent_np.getATTR("contains_pn") == antecedent.getText(): pronoun_class[text].proper_antecedent += 1 elif specificity_utils.isProper(antecedent_np): pronoun_class[text].proper_antecedent += 1 #how often are antecedents of this pronoun in the subj or dobj #position of a verb? if antecedent_np["GRAMMAR"] == "SUBJECT": pronoun_class[text].subj += 1 elif antecedent_np["GRAMMAR"] == "OBJECT": pronoun_class[text].dobj += 1 else: pronoun_class[text].starts_chain()
total_scores = {"vps_guessed" : 0, "vps_correct" : 0 } RESPONSE_TYPE = "" for f in files: f=f.strip() print("Working on file: {0}".format(f)) gold_chains = reconcile.getGoldChains(f) pairs = [] if "-hobbs" in sys.argv: RESPONSE_TYPE = "Hobbs" #read in the hobbs annotations hobbs_pairs = reconcile.getProResPairs(f, "hobbs") for pair in hobbs_pairs: ana_head = specificity_utils.getHead(utils.textClean(pair[1].getText())).lower() if ana_head in VPs: pairs.append(pair) elif "-rec" in sys.argv: #TODO if we choose this route then there needs to be some mods #since each vp can be resolved multiple times. # 1. only count the closest antecedent? # 2. don't count string matches? # 3. look at what is in the "pro_antes" property (that gives us the # Cogniac decision. # 4. take the average accuracy for each noun. RESPONSE_TYPE = "Reconcile" #predictions = "features.goldnps/predictions.DecisionTree.muc6_DecisionTree_goldnps" predictions = "features.goldnps-vps/predictions.DecisionTree.muc6_DecisionTree_goldnps-vps" pairs = reconcile.getResponsePairs(f, predictions) for pair in reconcile_pairs:
def add_stats(text, head, anaphor, doc, nouns, head2text): #catches a problem with the following report if head == 'the': head = text.split()[-1] if head.endswith("%"): return #skip percents if head[-1].isdigit(): return #skip numbers if utils.isConj(head): return #just skip these guys too if head == "himself": return #NOTE for some reason, the filter doesn't #catch this, must be happening after head #noun is created. if head == "themselves": return if head == "head": return if head == "where": return if head == "there": return if head == "here": return anaphor_np = doc.nps.getAnnotBySpan(anaphor.getStart(), anaphor.getEnd()) #update the head2text dict if text not in head2text[head]: head2text[head].append(text) #make sure the head nouns are reasonable #print "{0} => {1}".format(text, head) #then look for thangs if text not in list(nouns.keys()): nouns[text] = VirtualPronoun(text) nouns[text].updateDocs(doc.getName()) else: nouns[text].updateCount() nouns[text].updateDocs(doc.getName()) if anaphor_np["GRAMMAR"] == "SUBJECT": nouns[text].subj += 1 elif anaphor_np["GRAMMAR"] == "OBJECT": nouns[text].dobj += 1 #begin modifier code definite = "the {0}".format(head) indefinite1 = "a {0}".format(head) indefinite2 = "an {0}".format(head) #pos = reconcile.getPOS(doc.getName()) #head_index = specificity_utils.getHeadIndex(anaphor_np, head) #np_pos = pos.getSubset(anaphor.getStart(), anaphor.getEnd()) #np_words = text.split() if text.startswith(definite): nouns[text].bare_definite += 1 #elif text.startswith(indefinite1) or text.startswith(indefinite2): #nouns[text].indefinite += 1 #else: ##NOTE: just checking to see if there is some kind of modification now #if len(np_pos) == len(np_words): ##sys.stderr.write("Mismatch tag and word length: {0} => {1}\n".format(np_pos.getList(), np_words)) #for i in range(0, head_index): #if np_pos[i]["TAG"] == "DT": #continue #elif np_pos[i]["TAG"] == "JJ": ##print "Adjective: {0}".format(np_words[i]) 
#nouns[text].adjective_modifiers.append(np_words[i]) #elif np_pos[i]["TAG"].startswith("N"): ##print "Noun: {0} {1}".format(np_words[i], np_pos[i]["TAG"]) #if np_pos[i]["TAG"].startswith("NNP"): #nouns[text].proper_modifiers.append(np_words[i]) #else: #nouns[text].common_modifiers.append(np_words[i]) #else: ##print "?: {0}".format(np_words[i]) #nouns[text].other_modifiers.append(np_words[i]) #if text.startswith("the "): #get parts of speech for the np: #else: ##not definite, but still modified #if len(np_pos) == len(np_words): ##sys.stderr.write("Mismatch tag and word length: {0} => {1}\n".format(np_pos.getList(), np_words)) #continue #for i in range(0, head_index): #if np_pos[i]["TAG"] == "DT": #continue #elif np_pos[i]["TAG"] == "JJ": ##print "Adjective: {0}".format(np_words[i]) #nouns[text].adjective_modifiers.append(np_words[i]) #elif np_pos[i]["TAG"].startswith("N"): ##print "Noun: {0} {1}".format(np_words[i], np_pos[i]["TAG"]) #if np_pos[i]["TAG"].startswith("NNP"): #nouns[text].proper_modifiers.append(np_words[i]) #else: #nouns[text].common_modifiers.append(np_words[i]) #else: ##print "?: {0}".format(np_words[i]) #nouns[text].other_modifiers.append(np_words[i]) #capture post modifiers #if text.find(head + " of ") > -1: #of_start = text.find(head + " of ") #of_object = text[len(head) + of_start + 3:] #nouns[text].of_attachments.append(of_object.strip()) #if text.find(head + " on ") > -1: #of_start = text.find(head + " on ") #of_object = text[len(head) + of_start + 3:] #nouns[text].on_attachments.append(of_object.strip()) #if text.find(head + " that ") > -1: #that_start = text.find(head + " that ") #that_clause = text[len(head) + that_start+5:] #nouns[text].that_attachments.append(that_clause.strip()) #if text.find(head + " with ") > -1: #that_start = text.find(head + " with ") #that_clause = text[len(head) + that_start+5:] #nouns[text].with_attachments.append(that_clause.strip()) #if text.find(head + " by ") > -1: #by_start = text.find(head + " by ") 
#by_object = text[len(head) + by_start+3:] #nouns[text].by_attachments.append(by_object.strip()) #if text.find(head + " which ") > -1: #which_start = text.find(head + " which ") #which_clause = text[len(head) + which_start+6:] #nouns[text].which_attachments.append(which_clause.strip()) #if len(np_pos) >= head_index+2 and len(np_words) >= head_index+2: #if np_pos[head_index+1]["TAG"] == "VBD": #nouns[text].verbed.append(np_words[head_index+1]) #if np_pos[head_index+1]["TAG"] == "VBG": #nouns[text].verbing.append(np_words[head_index+1]) #end modifier code #find which chain the anaphor is from and add the chain statistics anaphor_chain = None for chain in list(doc.gold_chains.keys()): for mention in doc.gold_chains[chain]: if anaphor == mention: anaphor_chain = chain break chain_name = "{0}:{1}".format(doc.getName(), anaphor_chain) if chain_name not in nouns[text].chains: nouns[text].chains.append(chain_name) if anaphor_chain is not None: chain_length = len(doc.gold_chains[anaphor_chain]) nouns[text].chain_size[doc.getName()] = chain_length #coverage #chain_start = doc.gold_chains[chain][0].getStart() #chain_end = doc.gold_chains[chain][-1].getEnd() #chain_size = chain_end - chain_start #chain_coverage = float(chain_size) / len(doc.text) # number of sentences touched / number of sentences covered_sentences = 0 for sent in doc.sentences: for mention in doc.gold_chains[anaphor_chain]: if sent.contains(mention): covered_sentences += 1 break chain_coverage = float(covered_sentences) / len(doc.sentences) nouns[text].chain_coverage[doc.getName()] = chain_coverage for chain in list(doc.gold_chains.keys()): if chain == anaphor_chain: continue if len(doc.gold_chains[chain]) > chain_length: break else: nouns[text].largest_chain += 1 common_only = True for mention in doc.gold_chains[anaphor_chain]: if mention == anaphor: continue mention_head = getHead(utils.textClean(mention.getText())) if mention_head not in nouns[text].all_entities: 
nouns[text].all_entities.append(mention_head) #does this chain contain proper names? mention_np = doc.nps.getAnnotBySpan(mention.getStart(), mention.getEnd()) if specificity_utils.isProper(mention_np): common_only = False if chain_name not in list(nouns[text].nom_chain_only.keys()): nouns[text].nom_chain_only[chain_name] = common_only else: sys.stderr.write("Anaphor chain not found?\n") antecedent = doc.closest_antecedent(anaphor) if antecedent is not None: #record stats sd = doc.sentence_distance(antecedent, anaphor) nouns[text].sentence_distance(sd) nouns[text].most_recent_antecedents.append( antecedent.getText().lower()) antecedent_np = doc.nps.getAnnotBySpan(antecedent.getStart(), antecedent.getEnd()) if antecedent_np["GRAMMAR"] == "SUBJECT": nouns[text].subj_ante += 1 elif antecedent_np["GRAMMAR"] == "OBJECT": nouns[text].dobj_ante += 1 if antecedent.getText().lower() == anaphor.getText().lower(): nouns[text].string_matches += 1 if specificity_utils.isProper(antecedent_np): nouns[text].prp_ante += 1 elif specificity_utils.isNominal(antecedent_np): nouns[text].nom_ante += 1 elif specificity_utils.isPronoun(antecedent_np): nouns[text].pro_ante += 1 else: #this guy starts the chain nouns[text].starts_chain += 1
#sys.stdout.flush() #sys.stdout.write("\r") #prog = ProgressBar(len(files)) i = 0 for f in files: #prog.update_time(i) #sys.stdout.write("\r%s" % (str(prog))) #sys.stdout.flush() i += 1 #read in the nps nps = reconcile.getNPs(f) sentences = reconcile.getSentences(f) #see which nps correspond to these heads for np in nps: np_text = utils.textClean(np.getText()) np_head = specificity_utils.getHead(np_text) if np_head in heads: #print "{0:35} => {1}".format(np_text, np_head) head2nouns[np_head].addDoc(f) head2nouns[np_head].addText(np_text) head2nouns[np_head].count += 1 head2nouns[np_head].addDefinite(np) if np["GRAMMAR"] == "SUBJECT": head2nouns[np_head].subj += 1 elif np["GRAMMAR"] == "OBJECT": head2nouns[np_head].dobj += 1 np_sentence = getSentence(np, sentences)
def word_distance(self, ant, wd):
    """Record word distance `wd`, keyed by the antecedent's cleaned,
    lowercased text."""
    key = utils.textClean(ant.getText().lower()).strip()
    self.word_distances[key].append(wd)
sys.stderr.flush() j += 1 this_files_faux_pronouns = [] tokens = reconcile.getTokens(f) pos = reconcile.getPOS(f) #read in all possible quasi pronouns if ACE: ace_annots = reconcile.parseGoldAnnots(f) faux_pronouns = collectACEFPs(ace_annots) else: nps = reconcile.getNPs(f) faux_pronouns = collectFPs(nps, pos) for fp in faux_pronouns: text = utils.textClean(fp.getText()).lower().strip() if ACE: head = fp["HEAD"].lower().strip() else: np_tags = pos.getSubset(fp.getStart(), fp.getEnd()) head = qp_utils.getHead2(text, np_tags).lower().strip() #NOTE select what types of modification to allow on QPs here. #select only qps and bare definites #if (head in qps) and bareDefinite(text, head): # this_files_faux_pronouns.append(fp) #all QPs regardless of modification if head in qps: this_files_faux_pronouns.append(fp)
sys.exit(0)

# Build the list of documents from the file given on the command line,
# skipping commented-out entries.
files = []
with open(sys.argv[1], 'r') as inFile:
    files.extend([x for x in inFile.readlines() if not x.startswith("#")])

# Collect the distinct head nouns of all nominal/pronominal/date NPs.
heads = []
for f in files:
    if f.startswith("#"):
        continue
    f = f.strip()
    #NOTE will need to get all NPs eventually, not just gold
    nps = reconcile.getNPs(f)
    for np in nps:
        if specificity_utils.isNominal(np) or \
                specificity_utils.isPronoun(np) or \
                np["DATE"] != "NONE":
            head = specificity_utils.getHead(
                utils.textClean(np.getText()).lower())
            # skip conjoined heads
            if head.find(" and ") > -1:
                continue
            if head not in heads:
                heads.append(head)

# Write the collected heads, one per line, with the source list as a header.
with open("nouns.txt", 'w') as outFile:
    outFile.write("#" + sys.argv[1] + "\n")
    for head in heads:
        outFile.write("{0}\n".format(head))
# Creation Date : 08-27-2013 # Last Modified : Wed 28 Aug 2013 03:08:42 PM MDT # Created By : Nathan Gilbert # import sys import en from pyconcile import reconcile from pyconcile import utils import specificity_utils if __name__ == "__main__": if len(sys.argv) < 2: print("Usage: %s <dir>" % (sys.argv[0])) sys.exit(1) #read in the nps nps = reconcile.getNPs(sys.argv[1]) for np in nps: if specificity_utils.isProper(np) or specificity_utils.isPronoun(np): continue head = specificity_utils.getHead(utils.textClean(np.getText())) print("{0} => {1}".format(np.pprint(), head)) print(en.noun.senses(head)) print(en.noun.hypernyms(head, sense=0)) print("=" * 30) #read in named entities and/or read the #fire up wordnet
head2correct = {} #heads that have a correct antecedent. head2wrong = {} #heads that have an incorrect antecedent. appears in a #chain with nothing else it is coreferent with head2none = {} #heads that were not given an antecedent. --focus on #this one first head2counts = {} outputfile = "/features.goldnps/predictions.StanfordSieve.stanfordsieve/SingleLink" for f in files: #gather all the chains that were generated by the system. gold_chains = reconcile.getGoldChains(f) response_chains = reconcile.getResponseChains(f, outputfile) nps = reconcile.getNPs(f) for np in nps: head = specificity_utils.getHead(utils.textClean( np.getText())).lower() if head in heads: #this is the number of times that a NP appeared in a doc head2counts[head] = head2counts.get(head, 0) + 1 #print "{0} : {1}".format(np.pprint(), head) #for chain in response_chains: # if len(response_chains[chain]) > 1: # for mention in response_chains[chain]: # print mention.pprint() # print #find all the gold vps that were not assigned any cluster. for chain in list(response_chains.keys()): if len(response_chains[chain]) == 1: mention = response_chains[chain][0]
def sent_distance(self, ant, sd):
    """Record sentence distance `sd`, keyed by the antecedent's cleaned,
    lowercased text."""
    key = utils.textClean(ant.getText().lower()).strip()
    self.sent_distances[key].append(sd)
for f in files: if f.startswith("#"): continue #prog.update_time(i) #sys.stdout.write("\r%s" % (str(prog))) #sys.stdout.flush() i += 1 f = f.strip() doc = Document(f) gold_nps = reconcile.getNPs(f) gold_chains = reconcile.getGoldChains(f) doc.addGoldChains(gold_chains) for np in gold_nps: text = utils.textClean(np.getText().lower()).strip() if TRUE_PRONOUNS: if text in TRUE: add_stats(text, np, doc, nouns, head2text) else: if specificity_utils.isNominal(np): #head = getHead(text) #if head.endswith("%"): continue #skip percents #if head[-1].isdigit(): continue #skip numbers #if utils.isConj(head): continue #just skip these guys too add_stats(text, np, doc, nouns, head2text) #sys.stdout.write("\r \r\n") #sorted_nouns = sorted(nouns, key=operator.attrgetter('count'), reverse=True) #sorted_nouns = sorted(nouns.values(), key=operator.attrgetter('count'), reverse=True) #print sorted_nouns
#in case there were no predictions in a file try: all_pairs = reconcile.getFauxPairs(f, predictions) except: continue response_pairs = [] for pair in all_pairs: if pair[0] is None or pair[1] is None: continue response_pairs.append(pair) for pair in response_pairs: ana_head = specificity_utils.getHead( utils.textClean(pair[1].getText())).lower().strip() #skip real pronouns if ana_head not in data.ALL_PRONOUNS: pairs.append(pair) if ana_head not in list(heads2nouns.keys()): heads2nouns[ana_head] = Noun(ana_head) else: heads2nouns[ana_head].updateCount() labeled_annots = reconcile.labelCorrectPairs(gold_chains, pairs) for pair in labeled_annots: total_scores["vps_guessed"] += 1 ana_head = specificity_utils.getHead( utils.textClean(pair[1].getText())).lower().strip() if pair[2]: #correct pair
break outdir = "{0}/{1}/{2}".format(f, features_dir, predictions_dir) if not os.path.exists(outdir): os.makedirs(outdir) #output in some format easy for counting the accuracy. with open(outdir + "/faux.predictions", 'w') as outfile: outfile.write("#" + f + "\n") for res in resolutions: antecedent = res[0] anaphor = res[1] sem = res[2] outfile.write("{0},{1}\t{2},{3}\t{4} <- {5}\tSEM:{6}\n".format( antecedent.getStart(), antecedent.getEnd(), anaphor.getStart(), anaphor.getEnd(), utils.textClean(antecedent.getText()), utils.textClean(anaphor.getText()), sem)) #output in some format easy for counting the accuracy. with open(outdir + "/pronoun.predictions", 'w') as outfile: outfile.write("#" + f + "\n") for res in pronoun_resolutions: antecedent = res[0] anaphor = res[1] outfile.write("{0},{1}\t{2},{3}\t{4} <- {5}\n".format( antecedent.getStart(), antecedent.getEnd(), anaphor.getStart(), anaphor.getEnd(), utils.textClean(antecedent.getText()), utils.textClean(anaphor.getText())))
def process(f, head2qp, annotated_file):
    """Collect quasi-pronoun statistics for one document.

    Two modes: when `annotated_file` is true, NPs come from the gold
    (reconcile) annotations; otherwise from the Stanford parser output.
    In PRONOUNS mode only pronouns are kept, otherwise only nominals.
    Each kept NP's head is tallied in `head2qp` (head -> QuasiPronoun)
    and passed to process_syntax (and, for gold + USE_GOLD, process_gold).

    f              -- document path/name for the reconcile loaders
    head2qp        -- dict of head string -> QuasiPronoun, mutated in place
    annotated_file -- True for gold annotations, False for Stanford NPs
    """
    stanford_deps = reconcile.getStanfordDep(f)
    pos = reconcile.getPOS(f)
    if annotated_file:
        nps = reconcile.getNPs(f)
        for np in nps:
            head = None
            text = None
            if PRONOUNS:
                if qp_utils.isPronoun(np):
                    head = np.getText().lower()
                    text = np.getText()
                else:
                    continue
            else:
                if qp_utils.isNominal(np, pos):
                    text = utils.textClean(np.getText())
                    # head is found from the NP's own POS tags
                    np_tags = pos.getSubset(np.getStart(), np.getEnd())
                    head = utils.textClean(
                        qp_utils.getHead2(text.lower(), np_tags))
                else:
                    continue
            #bookkeeping
            if head not in list(head2qp.keys()):
                head2qp[head] = QuasiPronoun(head)
                head2qp[head].updateCount(True)
                head2qp[head].updateDocs(f, True)
            else:
                head2qp[head].updateDocs(f, True)
                head2qp[head].updateCount(True)
            if USE_GOLD:
                gold_chains = reconcile.getGoldChains(f)
                process_gold(f, np, head, text, head2qp, gold_chains)
            process_syntax(f, np, head, text, head2qp, stanford_deps)
    else:
        stanford_nps = reconcile.getStanfordNPs(f)
        for np in stanford_nps:
            if PRONOUNS:
                if np["is_pronoun"]:
                    head = np.getText().lower()
                    text = np.getText()
                else:
                    continue
            else:
                #skip some problems with the parser or numbers
                if np["HEAD"].startswith("$") or np["HEAD"].endswith(
                        "%") or np["HEAD"] == ".":
                    continue
                if np["is_nominal"]:
                    text = utils.textClean(np.getText())
                    head = np["HEAD"].lower()
                else:
                    continue
            #bookkeeping
            # second flag False marks these counts as parser-derived
            if head not in list(head2qp.keys()):
                head2qp[head] = QuasiPronoun(head)
                head2qp[head].updateDocs(f, False)
                head2qp[head].updateCount(False)
            else:
                head2qp[head].updateDocs(f, False)
                head2qp[head].updateCount(False)
            process_syntax(f, np, head, text, head2qp, stanford_deps)
heads2nps = defaultdict(list) i = 0 for f in files: prog.update_time(i) i += 1 sys.stderr.write("\r%s" % (str(prog))) sys.stderr.flush() f = f.strip() nps = reconcile.getNPs(f) for np in nps: if specificity_utils.isNominal(np) or \ specificity_utils.isPronoun(np) or \ np["DATE"] != "NONE": head = specificity_utils.getHead( utils.textClean(np.getText()).lower()) if head.find(" and ") > -1: continue if head in heads: heads2nps[head].append(utils.textClean(np.getText())) sys.stderr.write("\r \r\n") for head in list(heads2nps.keys()): counts = {} definite = False for np in heads2nps[head]: if np == head: continue if np == "the " + head or np == "a " + head or np == "that " + head: definite = True continue
ana_head = specificity_utils.getHead(pair[1].getText()).lower() if ana_head in FAUX_PRONOUNS: if ana_head not in list(tracked_nouns.keys()): tracked_nouns[ana_head] = Noun(ana_head) tracked_pairs.append(pair) #label the correct or incorrect pairs labeled_faux_pairs = reconcile.labelCorrectPairs(gold_chains, tracked_pairs) for lpair in labeled_faux_pairs: ana_head = specificity_utils.getHead(lpair[1].getText()).lower() key = "{0}:{1}:{2}".format(f, lpair[1].getStart(), lpair[1].getEnd()) tracked_nouns[ana_head].instances[key] = utils.textClean(lpair[1].getText()) tracked_nouns[ana_head].antecedents[key] = utils.textClean(lpair[0].getText()) tracked_nouns[ana_head].labels[key] = lpair[2] #this is an incorrect antecedent if not lpair[2]: closest_true_antecedent = closest_antecedent(gold_chains, lpair[1]) #deals with sentence distance resp_ant_sent = getAnnotSentenceNum(sentences, lpair[1]) true_ant_sent = getAnnotSentenceNum(sentences, closest_true_antecedent) ana_sent = getAnnotSentenceNum(sentences, lpair[1]) if closest_true_antecedent is not None: true_dist = ana_sent - true_ant_sent