def processACE(f, np, heads2qp):
    """Update per-head QuasiPronoun statistics for a single NP of document `f`.

    Looks up the gold ACE annotation spanning `np`; if that annotation is a
    nominal, records head/text bookkeeping in `heads2qp` (mutated in place)
    and then hands off to `process` for further feature extraction.

    Args:
        f: document identifier/path, passed through to the reconcile lookups.
        np: an NP annotation exposing getStart()/getEnd()/getText().
        heads2qp: dict mapping head string -> QuasiPronoun; mutated in place.

    NOTE(review): indentation reconstructed from a collapsed source line --
    confirm the `process(...)` call belongs inside the is_nominal branch.
    """
    ace_annots = reconcile.parseGoldAnnots(f)
    stanford_deps = reconcile.getStanfordDep(f)
    gold_chains = reconcile.getGoldChains(f)
    # gold annotation covering exactly the same byte span as np
    ace_np = ace_annots.getAnnotBySpan(np.getStart(), np.getEnd())
    if ace_np["is_nominal"]:
        head = utils.textClean(ace_np["HEAD"].strip().lower())
        text = utils.textClean(np.getText())
        #bookkeeping
        if head not in list(heads2qp.keys()):
            heads2qp[head] = QuasiPronoun(head)
        else:
            heads2qp[head].updateDocs(f)
            heads2qp[head].updateCount()
        if ace_np["GOLD_SINGLETON"]:
            # NOTE(review): attribute spelled "singleton" in the sibling
            # processACE variant elsewhere in this codebase -- confirm which
            # spelling QuasiPronoun actually defines.
            heads2qp[head].singelton += 1
        else:
            #does it start the chain?
            for gc in list(gold_chains.keys()):
                if gold_chains[gc][0] == np:
                    heads2qp[head].starts_chain += 1
                    break
        process(f, np, head, text, heads2qp, stanford_deps)
def OnFileOpen(self, e):
    """ File|Open event - Open dialog box.

    Lets the user pick a text file, loads its full contents into the left
    text box, remembers the chosen directory/filename on `self`, and
    refreshes `self.gold_chains` from the chosen directory.
    """
    # NOTE(review): wx.OPEN is the classic-wxPython flag; under Phoenix it
    # is wx.FD_OPEN -- confirm which wxPython generation this targets.
    dlg = wx.FileDialog(self, "Open", self.dirName, self.fileName,
                        "Text Files (*.txt)|*.txt|All Files|*.*", wx.OPEN)
    if (dlg.ShowModal() == wx.ID_OK):
        self.fileName = dlg.GetFilename()
        self.dirName = dlg.GetDirectory()
        # Bug fix: the py2 builtin `file(...)` does not exist in Python 3;
        # use open() with a context manager so the handle is always closed,
        # and read() instead of joining readlines().
        with open(os.path.join(self.dirName, self.fileName), 'r') as f:
            self.fullText = f.read()
        self.text_box_left.SetValue(self.fullText)
    dlg.Destroy()
    self.gold_chains = reconcile.getGoldChains(self.dirName)
def processACE(f, head2qp):
    """Collect QuasiPronoun statistics for every NP of ACE document `f`.

    For each NP (pronouns when PRONOUNS is set, otherwise nominals) this
    updates the per-head bookkeeping in `head2qp` (mutated in place), tallies
    gold singletons and bare-"a/an" phrases, and runs the gold-chain and
    Stanford-dependency feature extractors.

    NOTE(review): nesting reconstructed from a collapsed source line; the
    unconditional `process_syntax` call placement mirrors the sibling
    `process` function -- confirm against the original file.
    """
    global USE_GOLD
    ace_annots = reconcile.parseGoldAnnots(f)
    nps = reconcile.getNPs(f)
    stanford_deps = reconcile.getStanfordDep(f)
    gold_chains = reconcile.getGoldChains(f)
    for np in nps:
        # gold annotation covering the same span as this NP
        ace_np = ace_annots.getAnnotBySpan(np.getStart(), np.getEnd())
        head = None
        text = None
        if PRONOUNS:
            if qp_utils.isPronoun(np):
                head = ace_np["HEAD"].lower()
                text = np.getText()
            else:
                continue
        else:
            if ace_np["is_nominal"]:
                head = utils.textClean(ace_np["HEAD"].strip().lower())
                text = utils.textClean(np.getText())
            else:
                continue
        #bookkeeping
        if head not in list(head2qp.keys()):
            head2qp[head] = QuasiPronoun(head)
        else:
            head2qp[head].updateDocs(f)
            head2qp[head].updateCount()
        if ace_np["GOLD_SINGLETON"]:
            head2qp[head].singleton += 1
            # indefinite ("a"/"an") singletons tracked separately
            if (text.startswith("a ") or text.startswith("an ")):
                head2qp[head].faux_ba += 1
        else:
            #does it start the chain?
            if USE_GOLD:
                process_gold(f, np, head, text, head2qp, gold_chains)
        process_syntax(f, np, head, text, head2qp, stanford_deps)
self.gold_buddies = [] self.buddies = [] def __str__(self): s = "{0} : [{1}]".format(self.text, ", ".join(self.gold_buddies)) return s if __name__ == "__main__": if len(sys.argv) < 2: print("Usage: %s <gold-document> <unannotated-documents-list>" % (sys.argv[0])) sys.exit(1) #read in gold chains gold_chains = reconcile.getGoldChains(sys.argv[1]) gold_chain_text = defaultdict(list) pos_tags = reconcile.getPOS(sys.argv[1]) tokens = reconcile.getTokens(sys.argv[1]) for key in list(gold_chains.keys()): for mention in gold_chains[key]: if mention.pprint() not in [ x.pprint() for x in gold_chain_text[key] ]: #np_pos = [x.getATTR("TAG") for x in \ # pos_tags.getOverlapping(mention)] #np_tok = [x.getText() for x in \ # tokens.getOverlapping(mention)] #mention.setProp("TAGS", np_pos)
i = 0 prog = ProgressBar(len(files)) correct_qps = {} incorrect_qps = {} for f in files: if f.startswith("#"): continue f = f.strip() prog.update_time(i) sys.stderr.write("\r%s" % (str(prog))) sys.stderr.flush() i += 1 nps = reconcile.getNPs(f) gold_chains = reconcile.getGoldChains(f) try: all_pairs = reconcile.getFauxPairs(f, PREDICTIONS) except: continue response_pairs = [] for pair in all_pairs: if pair[0] is None or pair[1] is None: continue response_pairs.append(pair) labeled_annots = reconcile.labelCorrectPairs(gold_chains, response_pairs) for pair in labeled_annots: if ACE:
dest="duncan", type="string", default="") parser.add_option("-v", help="Verbose. Be it.", action="store_true", dest="verbose", default=False) (options, args) = parser.parse_args() if len(sys.argv) < 3: parser.print_help() sys.exit(1) if options.directory is not None: gold_chains = reconcile.getGoldChains(options.directory) duncan_pairs = duncan.getDuncanPairs(options.directory) accuracy = score.accuracy(gold_chains, duncan_pairs) print("A: %d/%d = %0.2f" % (accuracy[0], accuracy[1], accuracy[2])) elif options.filelist is not None: filelist = open(options.filelist, 'r') total = [0, 0] h_stats_correct = {} h_stats_total = {} for f in filelist: f = f.strip() if f.startswith("#"): continue gold_chains = reconcile.getGoldChains(f) duncan_pairs = duncan.getDuncanPairs(f) accuracy = score.accuracy(gold_chains, duncan_pairs)
antecedent.getATTR("semantic"), antecedent.getATTR("gender"), antecedent.getATTR("number")) attrs2 = "sem=%s, gen=%s, num=%s" % (anaphor.getATTR("semantic"), anaphor.getATTR("gender"), anaphor.getATTR("number")) if byte not in tmp: tmp.append(byte) print("%s [%s] <- %s [%s] (H:%s)" % (antecedent.ppprint(), attrs1, anaphor.ppprint(), attrs2, h)) if options.vverbose: print("===================") if options.evaluate: GoldChains = reconcile.getGoldChains(options.directory) s = score.accuracy(GoldChains, all_pairs) print("Document Score:") print(" Accuracy: %0.2f with %d Correct, %d Incorrect" % (s[0], s[1], s[2])) if options.stats: GoldChains = reconcile.getGoldChains(options.directory) s = score.accuracy(GoldChains, all_pairs) print("%d %d %d" % (s[1], s[2], s[3])) if options.write: heurs = defaultdict(list) UniquePairs = [] for i in range(0, len(all_pairs)): curr = all_pairs[i]
# import sys import pydot from pyconcile import reconcile if __name__ == "__main__": if len(sys.argv) < 2: print("Usage: %s <response-file>" % (sys.argv[0])) sys.exit(1) dataDir = sys.argv[1][:sys.argv[1].find("/")] responseFile = sys.argv[1][sys.argv[1].find("/"):] clusterer="SingleLink" sentences = reconcile.getSentences(dataDir) gold_chains = reconcile.getGoldChains(dataDir) #get reconcile's edges response_chains = reconcile.getResponseChains(dataDir, responseFile+"/"+clusterer) response_pairs = reconcile.getResponsePairs(dataDir, responseFile, 0.5) response_pairs = reconcile.labelCorrectPairs(gold_chains, response_pairs) #pydot graph graph = pydot.Dot("reconcile_clusters", graph_type='digraph') #add in all the NP #NOTE: as long as we are working with gold mentions, the response and gold #will match. otherwise, will need to switch over to gold nps to see proper #'misses'
def process(f, head2qp, annotated_file):
    """Collect QuasiPronoun statistics for every NP of document `f`.

    Two sources of NPs are supported: gold/Reconcile annotations when
    `annotated_file` is truthy, otherwise Stanford-parser NPs. In both cases
    pronouns are used when PRONOUNS is set and nominals otherwise; per-head
    bookkeeping in `head2qp` is mutated in place, the `annotated_file` flag
    is forwarded to updateDocs/updateCount, and syntax (and, for annotated
    files with USE_GOLD, gold-chain) feature extractors are run.

    NOTE(review): indentation reconstructed from a collapsed source line --
    confirm branch boundaries against the original file.
    """
    stanford_deps = reconcile.getStanfordDep(f)
    pos = reconcile.getPOS(f)
    if annotated_file:
        nps = reconcile.getNPs(f)
        for np in nps:
            head = None
            text = None
            if PRONOUNS:
                if qp_utils.isPronoun(np):
                    head = np.getText().lower()
                    text = np.getText()
                else:
                    continue
            else:
                if qp_utils.isNominal(np, pos):
                    text = utils.textClean(np.getText())
                    # POS tags restricted to this NP's span, used to find the head
                    np_tags = pos.getSubset(np.getStart(), np.getEnd())
                    head = utils.textClean(
                        qp_utils.getHead2(text.lower(), np_tags))
                else:
                    continue
            #bookkeeping
            if head not in list(head2qp.keys()):
                head2qp[head] = QuasiPronoun(head)
                head2qp[head].updateCount(True)
                head2qp[head].updateDocs(f, True)
            else:
                head2qp[head].updateDocs(f, True)
                head2qp[head].updateCount(True)
            if USE_GOLD:
                gold_chains = reconcile.getGoldChains(f)
                process_gold(f, np, head, text, head2qp, gold_chains)
            process_syntax(f, np, head, text, head2qp, stanford_deps)
    else:
        stanford_nps = reconcile.getStanfordNPs(f)
        for np in stanford_nps:
            if PRONOUNS:
                if np["is_pronoun"]:
                    head = np.getText().lower()
                    text = np.getText()
                else:
                    continue
            else:
                #skip some problems with the parser or numbers
                if np["HEAD"].startswith("$") or np["HEAD"].endswith(
                        "%") or np["HEAD"] == ".":
                    continue
                if np["is_nominal"]:
                    text = utils.textClean(np.getText())
                    head = np["HEAD"].lower()
                else:
                    continue
            #bookkeeping
            if head not in list(head2qp.keys()):
                head2qp[head] = QuasiPronoun(head)
                head2qp[head].updateDocs(f, False)
                head2qp[head].updateCount(False)
            else:
                head2qp[head].updateDocs(f, False)
                head2qp[head].updateCount(False)
            # no gold chains for unannotated files; syntax features only
            process_syntax(f, np, head, text, head2qp, stanford_deps)
def gold_annotations(f):
    """Process the file with gold annotations.

    Walks every gold coreference chain of document `f` and accumulates
    statistics about nominal mentions into the module-level counter dicts
    (virtual_pronouns, nominal_base_antecedent, docs_appeared, semantic-class
    tallies, subj/obj verb lists, sentence/TextTiling distances from the
    previous mention, antecedent-type distributions, and total word counts).

    The first mention of each chain is treated as the base antecedent and is
    only counted as a (non-anaphoric) chain starter; every later nominal
    mention is counted as a "virtual pronoun".

    NOTE(review): indentation reconstructed from a collapsed source line --
    confirm branch boundaries against the original file.
    """
    global virtual_pronouns, total_counts, virtual_pronoun_heads, \
        nominal_base_antecedent, distance_from_antecedent
    doc = Document(f)
    gold_chains = reconcile.getGoldChains(f)
    #adding in Sundance nes.
    nes = reconcile.getNEs(f, True)
    add_reconcile_semantic_class(gold_chains, nes)
    #adding in Reconcile pos too.
    pos = reconcile.getPOS(f, True)
    #getting the docs nps
    reconcile_nps = reconcile.getNPs_annots(f)
    #getting sundance nps
    sundance_nps = reconcile.getSundanceNPs(f)
    add_sundance_nps(gold_chains, sundance_nps)
    original_text_heads = {}  # just getting the heads
    original_text = defaultdict(list)  # for getting total doc counts later.
    nominal2chains = defaultdict(
        list)  # the chains that a given nominal appears.
    for chain in list(gold_chains.keys()):
        base_antecedent = True
        prev_annot = None  # closest preceding mention in this chain
        antecedents = 0    # number of mentions seen so far in this chain
        for mention in gold_chains[chain]:
            #if the first antecedent in a chain, do not list it as anaphoric.
            if base_antecedent:
                if mention.getATTR("is_nominal") and not \
                        mention.getATTR("GOLD_SINGLETON"):
                    text = mention.getText()
                    text_lower = mention.getATTR("TEXT_CLEAN").lower()
                    docs_appeared[text_lower].append(f)
                    nominal_base_antecedent[text_lower] = \
                        nominal_base_antecedent.get(text_lower, 0) + 1
                    original_text[text_lower].append(text)
                    #take note that this chain contained this nominal
                    nominal2chains[text_lower].append(chain)
                    #take note of the gold semantic class
                    gold_semantic_class[text_lower].append(
                        mention.getATTR("GOLD_SEMANTIC"))
                    #reconcile's semantic class
                    reconcile_semantic_class[text_lower].append(
                        mention.getATTR("NE_CLASS"))
                    #sundance's semantic class
                    sun_semantic_class[text_lower].append(
                        mention.getATTR("SUN_SEMANTIC"))
                    number_gold_antecedents[text_lower].append(antecedents)
                    #get verb stats
                    if mention.getATTR("ROLE") == "SUBJ":
                        verb = reconcile.getSubjVerb(mention, pos)
                        if verb != None:
                            subj_verbs[text_lower].append(verb.lower())
                    elif mention.getATTR("ROLE") == "DOBJ":
                        verb = reconcile.getObjVerb(mention, pos)
                        if verb != None:
                            obj_verbs[text_lower].append(verb.lower())
                base_antecedent = False
                prev_annot = mention
                antecedents += 1
                continue
            if mention.getATTR("is_nominal"):
                text = mention.getText()
                text_lower = mention.getATTR("TEXT_CLEAN").lower()
                head_text = mention.getATTR("HEAD_TEXT")
                original_text[text_lower].append(text)
                # anaphoric nominal == "virtual pronoun"
                virtual_pronouns[text_lower] = \
                    virtual_pronouns.get(text_lower, 0) + 1
                virtual_pronoun_heads[head_text.lower()] = \
                    virtual_pronoun_heads.get(head_text.lower(), 0) + 1
                #the semantic class Reconcile puts this in.
                reconcile_semantic_class[text_lower].append(
                    mention.getATTR("NE_CLASS"))
                #register this doc as containing this np.
                docs_appeared[text_lower].append(f)
                #take note that this chain contained this nominal
                nominal2chains[text_lower].append(chain)
                #take note of the gold semantic class
                gold_semantic_class[text_lower].append(
                    mention.getATTR("GOLD_SEMANTIC"))
                #the number of possible correct antecedents for this anaphor
                number_gold_antecedents[text_lower].append(antecedents)
                #sundance's semantic class
                sun_semantic_class[text_lower].append(
                    mention.getATTR("SUN_SEMANTIC"))
                # subject verb statistics
                # NOTE(review): unlike the base-antecedent branch, there is no
                # None guard on `verb` here -- verify that is intentional.
                if mention.getATTR("ROLE") == "SUBJ":
                    verb = reconcile.getSubjVerb(mention, pos)
                    subj_verbs[text_lower].append(verb.lower())
                elif mention.getATTR("ROLE") == "DOBJ":
                    verb = reconcile.getObjVerb(mention, pos)
                    obj_verbs[text_lower].append(verb.lower())
                #get the sentence distance from these two mentions.
                mention_sent = reconcile.getAnnotSentence(f, mention)
                prev_sent = reconcile.getAnnotSentence(f, prev_annot)
                if mention_sent > -1 and prev_sent > -1:
                    distance_from_antecedent[text_lower].append(mention_sent - \
                        prev_sent)
                #get the TextTiling segment distance for the two mentions
                mention_seg = doc.getAnnotTile(mention)
                prev_seg = doc.getAnnotTile(prev_annot)
                if mention_seg > -1 and prev_seg > -1:
                    focus_distance[text_lower].append(mention_seg - \
                        prev_seg)
                #getting the distribution of closest antecedent types for a
                #given nominal
                if prev_annot.getATTR("is_nominal"):
                    nominals2type[text_lower]["nominal"] = \
                        nominals2type[text_lower].get("nominal",0) + 1
                elif prev_annot.getATTR("is_pronoun"):
                    nominals2type[text_lower]["pronoun"] = \
                        nominals2type[text_lower].get("pronoun",0) + 1
                else:
                    nominals2type[text_lower]["proper"] = \
                        nominals2type[text_lower].get("proper",0) + 1
            prev_annot = mention
            antecedents += 1
    #for key in nominal2chains.keys():
    #    print "%d : %s (doc: %s)" % (len(list(set(nominal2chains[key]))), key,
    #    doc)
    #update the total counts.
    for key in list(original_text.keys()):
        for text in list(set(original_text[key])):
            total_counts[key] = total_counts.get(key, 0) + doc.getWordCounts(text)
    #the head counts
    for key in list(virtual_pronoun_heads.keys()):
        total_counts_heads[key] = total_counts_heads.get(key, 0) + \
            doc.getWordCounts(key)