def _add_tlinks_to_fragment(self, in_fragment, tmp_fragment, out_fragment):
    """Take the links from the merged tlinks and add them into the fragment.

    Based on the method with the same name in the classifier wrapper. All
    TLINKs already present in in_fragment are removed and replaced with the
    TLINKs found in tmp_fragment; the result is written to out_fragment.

    Arguments:
       in_fragment - path of the fragment whose TLINKs are replaced
       tmp_fragment - path of the file holding the merged TLINKs
       out_fragment - path the updated fragment is saved to"""
    # Use context managers so the input handles are closed even if parsing
    # fails; the original version leaked both file handles.
    with open(in_fragment, 'r') as in_fh:
        xmldoc1 = Parser().parse_file(in_fh)
    with open(tmp_fragment, 'r') as tmp_fh:
        xmldoc2 = Parser().parse_file(tmp_fh)
    xmldoc1.remove_tags(TLINK)
    for tlink in xmldoc2.get_tags(TLINK):
        reltype = tlink.attrs[RELTYPE]
        # id1 is an event instance id or, failing that, a time id
        id1 = tlink.attrs.get(EVENT_INSTANCE_ID, None)
        if not id1:
            id1 = tlink.attrs.get(TIME_ID, None)
            if not id1:
                logger.warn("Could not find id1 in " + tlink.content)
        # id2 follows the same pattern for the related element
        id2 = tlink.attrs.get(RELATED_TO_EVENT_INSTANCE, None)
        if not id2:
            id2 = tlink.attrs.get(RELATED_TO_TIME, None)
            if not id2:
                logger.warn("Could not find id2 in " + tlink.content)
        # carry the origin over from the merged link (empty when absent)
        origin = tlink.attrs.get('origin', '')
        xmldoc1.add_tlink(reltype, id1, id2, origin)
    xmldoc1.save_to_file(out_fragment)
def _add_tlinks_to_fragment(self, in_fragment, tmp_fragment, out_fragment):
    """Takes the links created by the classifier and merges them into the
    input fragment.

    Arguments:
       in_fragment - path of the fragment the links are merged into
       tmp_fragment - path of the file with the classifier's TLINKs
       out_fragment - path the merged fragment is saved to"""
    # Use context managers so the input handles are closed even if parsing
    # fails; the original version leaked both file handles.
    with open(in_fragment, 'r') as in_fh:
        xmldoc1 = Parser().parse_file(in_fh)
    with open(tmp_fragment, 'r') as tmp_fh:
        xmldoc2 = Parser().parse_file(tmp_fh)
    for tlink in xmldoc2.get_tags(TLINK):
        reltype = tlink.attrs[RELTYPE]
        # id1 is an event instance id or, failing that, a time id
        id1 = tlink.attrs.get(EVENT_INSTANCE_ID, None)
        if not id1:
            id1 = tlink.attrs.get(TIME_ID, None)
            if not id1:
                logger.warn("Could not find id1 in " + tlink.content)
        # id2 follows the same pattern for the related element
        id2 = tlink.attrs.get(RELATED_TO_EVENT_INSTANCE, None)
        if not id2:
            id2 = tlink.attrs.get(RELATED_TO_TIME, None)
            if not id2:
                logger.warn("Could not find id2 in " + tlink.content)
        # mark the link as coming from the classifier, with its confidence
        origin = CLASSIFIER + ' ' + tlink.attrs.get(CONFIDENCE, '')
        xmldoc1.add_tlink(reltype, id1, id2, origin)
    xmldoc1.save_to_file(out_fragment)
def _add_tlinks_to_fragment(self, in_fragment, tmp_fragment, out_fragment):
    """Takes the links created by the classifier and merges them into the
    input fragment.

    Arguments:
       in_fragment - path of the fragment the links are merged into
       tmp_fragment - path of the file with the classifier's TLINKs
       out_fragment - path the merged fragment is saved to"""
    # Close the input files deterministically; the original version left
    # both handles open.
    with open(in_fragment, 'r') as in_fh:
        xmldoc1 = Parser().parse_file(in_fh)
    with open(tmp_fragment, 'r') as tmp_fh:
        xmldoc2 = Parser().parse_file(tmp_fh)
    for tlink in xmldoc2.get_tags(TLINK):
        reltype = tlink.attrs[RELTYPE]
        # id1 is an event instance id or, failing that, a time id
        id1 = tlink.attrs.get(EVENT_INSTANCE_ID, None)
        if not id1:
            id1 = tlink.attrs.get(TIME_ID, None)
            if not id1:
                logger.warn("Could not find id1 in " + tlink.content)
        # id2 follows the same pattern for the related element
        id2 = tlink.attrs.get(RELATED_TO_EVENT_INSTANCE, None)
        if not id2:
            id2 = tlink.attrs.get(RELATED_TO_TIME, None)
            if not id2:
                logger.warn("Could not find id2 in " + tlink.content)
        # mark the link as coming from the classifier, with its confidence
        origin = CLASSIFIER + ' ' + tlink.attrs.get(CONFIDENCE, '')
        xmldoc1.add_tlink(reltype, id1, id2, origin)
    xmldoc1.save_to_file(out_fragment)
class HtmlGenerator:

    """An HtmlGenerator is created for an XML file with TimeML tags. It is
    used to create HTML files that display the text with events and times
    highlighted and links lined up with the sentences. It can also be used
    to create tables for events, times and links.

    Instance variables:
       file_name - an absolute path
       xmldoc - an XmlDocument
       sentences - a list of Sentence objects
       instance_idx - maps eiids and eids to the MAKEINSTANCE element
       event_idx - maps eids to the EVENT element
       timex_idx - maps tids to the TIMEX3 element
       events - a list of Event objects
       timexes - a list of TIMEX3 elements
       links - a mapping from eiids and tids to links that contain them"""

    def __init__(self, file_name):
        self.file_name = file_name
        # parse the source document, closing the handle when done (the
        # original left it open)
        fh = open(file_name, 'r')
        try:
            self.xmldoc = Parser().parse_file(fh)
        finally:
            fh.close()
        self.sentences = []
        self.instance_idx = {}
        self.event_idx = {}
        self.timex_idx = {}
        self.events = []
        self.timexes = []
        self.links = {}
        self._init_sentence_list()
        # indexes must exist before the events list is built
        self._init_indexes()
        self._init_events_list()
        self._init_timexes_list()
        self._init_link_index()

    def _init_sentence_list(self):
        """Fill in the self.sentences list. Sentences do not include their
        opening and closing tags."""
        for open_s_tag in self.xmldoc.get_tags('s'):
            close_s_tag = open_s_tag.get_closing_tag()
            elements = open_s_tag.get_slice_till(close_s_tag.id)
            self.sentences.append(Sentence(elements))

    def _init_events_list(self):
        """Fill in self.events with an Event for each EVENT tag, paired with
        its MAKEINSTANCE element from instance_idx."""
        for event in self.xmldoc.get_tags('EVENT'):
            eid = event.attrs['eid']
            instance = self.instance_idx[eid]
            self.events.append(Event(event, instance))

    def _init_timexes_list(self):
        """Fill in self.timexes with all TIMEX3 tags."""
        self.timexes = self.xmldoc.get_tags('TIMEX3')

    def _init_indexes(self):
        """The values for these indexes are always XML elements."""
        for instance in self.xmldoc.get_tags('MAKEINSTANCE'):
            eiid = instance.attrs.get('eiid')
            eid = instance.attrs.get('eventID')
            # index the instance on both its eiid and its eid
            self.instance_idx[eiid] = instance
            self.instance_idx[eid] = instance
        for event in self.xmldoc.get_tags('EVENT'):
            eid = event.attrs.get('eid')
            self.event_idx[eid] = event
        for timex in self.xmldoc.get_tags('TIMEX3'):
            tid = timex.attrs.get('tid')
            self.timex_idx[tid] = timex
        # need to add dct, but this needs to be done differently (the
        # creation date is hard-coded here)
        self.timex_idx['t0'] = create_dct_element('20080515')

    def _init_link_index(self):
        """Index all ALINK, SLINK and TLINK elements on the ids of the
        elements they link."""
        link_elements = \
            self.xmldoc.get_tags('ALINK') + \
            self.xmldoc.get_tags('SLINK') + \
            self.xmldoc.get_tags('TLINK')
        for link_element in link_elements:
            link = Link(link_element, self.instance_idx)
            link_id = link.get_id(['eventInstanceID', 'timeID'])
            self.links.setdefault(link_id, []).append(link)

    def create_file(self, outfile, creators):
        """Creates a file with all sentences and the links lined up with the
        sentences. Only links whose creator is in creators are shown."""
        fh = open(outfile, 'w')
        try:
            # note: the original wrote "<table cellpadding=4" without the
            # closing '>', producing broken HTML
            fh.write("<html>\n<head><style>\n" +
                     "s {display: block; text-decoration: none}\n" +
                     "</style>\n<body>\n" +
                     "<table cellpadding=4>" +
                     "<tr>\n <td>source\n <td>%s\n" % self.file_name +
                     "<tr>\n <td>components\n <td>%s\n" % ' + '.join(creators) +
                     "</table>\n\n" +
                     "<hr>\n\n" +
                     "<table cellspacing=7pt>\n")
            for sentence in self.sentences:
                fh.write("\n<tr>\n <td>")
                sentence.print_html(fh)
                fh.write(" <td>")
                for sentence_id in sentence.get_ids():
                    for link in self.links.get(sentence_id, []):
                        if link.creator in creators:
                            fh.write(' ' + link.convert() + "<br/>\n")
            fh.write("\n</table>\n</body>\n</html>\n")
        finally:
            fh.close()

    def create_events_table(self, file):
        """Write an HTML table to file with one row per event, listing the
        event's text and its grammatical attributes."""
        fh = open(file, 'w')
        try:
            fh.write("<html>\n<body>\n<table cellpadding=4>\n" +
                     "<tr align=\"left\">\n" +
                     " <th bgcolor=\"#dddddd\">event\n" +
                     " <th bgcolor=\"#dddddd\">pos\n" +
                     " <th bgcolor=\"#dddddd\">class\n" +
                     " <th bgcolor=\"#dddddd\">tense\n" +
                     " <th bgcolor=\"#dddddd\">aspect\n" +
                     " <th bgcolor=\"#dddddd\">polarity\n" +
                     " <th bgcolor=\"#dddddd\">modality\n" +
                     "</tr>\n")
            celltag = '<td bgcolor="#dddddd">'
            for event in self.events:
                fh.write("<tr>\n")
                eid = event.attrs['eid']
                text = event.attrs['text']
                pos = event.attrs['pos'].lower()
                eclass = event.attrs['class'].lower()
                tense = event.attrs['tense'].lower()
                aspect = event.attrs['aspect'].lower()
                polarity = event.attrs['polarity'].lower()
                modality = event.attrs.get('modality', '').lower()
                # suppress default values to keep the table readable
                if tense == 'none':
                    tense = ''
                if aspect == 'none':
                    aspect = ''
                if polarity == 'pos':
                    polarity = ''
                if aspect == 'perfective_progressive':
                    aspect = 'perf_prog'
                fh.write(" %s<font color=red>%s_%s</font>\n" % (celltag, text, eid))
                fh.write(" %s%s\n" % (celltag, pos))
                fh.write(" %s%s\n" % (celltag, eclass))
                fh.write(" %s%s\n" % (celltag, tense))
                fh.write(" %s%s\n" % (celltag, aspect))
                fh.write(" %s%s\n" % (celltag, polarity))
                fh.write(" %s%s\n" % (celltag, modality))
            # note: the original closed the document with "<html>" instead
            # of "</html>"
            fh.write("</table>\n</body>\n</html>\n")
        finally:
            fh.close()

    def create_timexes_table(self, file):
        """Write an HTML table to file with one row per timex, listing its
        text, type and value."""
        fh = open(file, 'w')
        try:
            fh.write("<html>\n<body>\n<table cellpadding=4>\n" +
                     "<tr align=\"left\">\n" +
                     " <th bgcolor=\"#dddddd\">timex\n" +
                     " <th bgcolor=\"#dddddd\">type\n" +
                     " <th bgcolor=\"#dddddd\">value\n" +
                     "</tr>\n")
            celltag = '<td bgcolor="#dddddd">'
            for timex in self.timexes:
                fh.write("<tr>\n")
                tid = timex.attrs['tid']
                text = timex.collect_content()
                timex_type = timex.attrs['TYPE'].lower()
                value = timex.attrs.get('VAL')
                fh.write(" %s<font color=blue>%s_%s</font>\n" % (celltag, text, tid))
                fh.write(" %s%s\n" % (celltag, timex_type))
                fh.write(" %s%s\n" % (celltag, str(value)))
            # note: the original closed the document with "<html>" instead
            # of "</html>"
            fh.write("</table>\n</body>\n</html>\n")
        finally:
            fh.close()

    def create_links_table(self, file):
        """Write an HTML table to file with one row per link, showing both
        linked elements, the relation type and the link's origin."""
        fh = open(file, 'w')
        try:
            fh.write("<html>\n<body>\n<table cellpadding=4>\n")
            celltag = '<td bgcolor="#dddddd">'
            for sentence in self.sentences:
                for sentence_id in sentence.get_ids():
                    for link in self.links.get(sentence_id, []):
                        id1 = link.get_id1()
                        id2 = link.get_id2()
                        text1 = self._get_text(id1)
                        text2 = self._get_text(id2)
                        full_text1 = color_text(text1, id1)
                        full_text2 = color_text(text2, id2)
                        # fall back to the 'syntax' attribute when 'origin'
                        # is absent
                        origin = link.attrs.get('origin')
                        if not origin:
                            origin = link.attrs.get('syntax')
                        fh.write("<tr>\n")
                        fh.write(" %s%s\n" % (celltag, full_text1))
                        fh.write(" %s%s\n" % (celltag, link.attrs['relType']))
                        fh.write(" %s%s\n" % (celltag, full_text2))
                        fh.write(" %s%s\n" % (celltag, origin))
                        fh.write("</tr>\n")
            # note: the original closed the document with "<html>" instead
            # of "</html>"
            fh.write("</table>\n</body>\n</html>\n")
        finally:
            fh.close()

    def _get_text(self, id):
        """Return the text content for the element with the given id (a tid,
        an eiid, or 't0' for the document creation time)."""
        # this is a bit of a hack and needs to be solved elsewhere:
        # the DCT has no tag and the code to collect data crashes
        if id == 't0':
            return 'DCT'
        if id.startswith('t'):
            timex = self.timex_idx[id]
            return timex.collect_content()
        if id.startswith('e'):
            instance = self.instance_idx[id]
            eid = instance.attrs['eventID']
            event = self.event_idx[eid]
            return event.collect_content()
def tlink_inject_with_prior_with_check(no_tlink_file, result_file, tlink_file, original_file):
    """Inject TLINKs into the classified document, re-scoring each classifier
    vote with lemma-pair and label priors, and compare the outcome against
    the gold TLINKs in original_file.

    Arguments:
       no_tlink_file - path to the classified document without TLINKs
       result_file - path to a JSON file with per-line vote dictionaries
       tlink_file - path the document with injected TLINKs is written to
       original_file - path to the original (gold) document with TLINKs

    Returns (fix_label_counter, worsen_label_counter): how many labels the
    re-scoring fixed and how many it made worse, relative to the raw vote.

    NOTE(review): relies on module-level helpers `wn` (WordNet), `crd`,
    `histogram` and `reverse` that are not visible in this block."""
    """
    Verb event should be a dictionary to map between an event id
    and some other lemmas generated from the initial lemma.
    """
    verb_events = {}
    """
    EVENT tag sample
    <EVENT class="OCCURRENCE" eid="e1000028">
    """
    # NOTE(review): both file handles below are never closed
    xml_document = Parser().parse_file(open(no_tlink_file, "r"))
    xmldoc_original = Parser().parse_file(open(original_file, "r"))
    # collect, for every EVENT, the morphy lemma and the lemma names of the
    # first verb synset of its head word (element.next.content)
    for element in xml_document.get_tags(EVENT):
        if element.is_opening_tag():
            eid = element.attrs[EID]
            event_content = element.next.content
            synsets_event = None
            if len(wn.synsets(event_content, 'v')) > 0:
                # NOTE(review): in NLTK 3 `lemma_names` is a method, not an
                # attribute — confirm the NLTK version this targets
                synsets_event = wn.synsets(element.next.content, 'v')[0].lemma_names
            verb_morphy = wn.morphy(event_content, 'v')
            verb_events[eid] = {
                MORPHY_LEMMA: verb_morphy,
                SYNSET_LEMMA: synsets_event
            }
    """
    <MAKEINSTANCE eventID="e2" polarity="POS" pos="VERB" eiid="ei2"
    tense="PRESENT" aspect="PERFECTIVE">
    """
    # map every event instance id (eiid) to the lemma info of its event
    verb_event_instance = {}
    for element in xml_document.get_tags(INSTANCE):
        if element.is_opening_tag():
            eiid = element.attrs[EIID]
            eid = element.attrs[EVENTID]
            if eid in verb_events:
                verb_event_instance[eiid] = verb_events[eid]
    """
    All TLINKs in the original document between two events
    Because excepts the TLINKs parts, original and classified documents
    should be identical, so they could use the same verb_event_instance.
    """
    # index the gold event-event TLINKs on their (eiid, reiid) pair
    original_ee_tlinks = {}
    for element in xmldoc_original.get_tags(TLINK):
        # keep track of event order here
        if element.is_opening_tag():
            lid = element.attrs[LID]
            if EVENT_INSTANCE_ID in element.attrs:
                eiid = element.attrs[EVENT_INSTANCE_ID]
                if RELATED_TO_EVENT_INSTANCE in element.attrs:
                    reiid = element.attrs[RELATED_TO_EVENT_INSTANCE]
                    if RELTYPE in element.attrs:
                        if eiid in verb_event_instance and reiid in verb_event_instance:
                            original_ee_tlinks[(eiid, reiid)] = (
                                lid, element.attrs[RELTYPE])
    # NOTE(review): the context variable shadows the `result_file` parameter
    with open(result_file, 'r') as result_file:
        label_vote_dict = json.load(result_file)
    fix_label_counter = 0
    worsen_label_counter = 0
    for feature_type in label_vote_dict:
        for line_counter in label_vote_dict[feature_type]:
            result_dict = label_vote_dict[feature_type][line_counter][
                RESULT_DICT]
            label_vote = label_vote_dict[feature_type][line_counter][VOTE_DICT]
            ids = label_vote_dict[feature_type][line_counter][TLINK_IDS_DICT]
            # the raw classifier decision is the last (winning) vote
            raw_relType = label_vote[-1][0]
            """
            Have to re calculate the relType here
            - Calculate P ( label | lemma_pair, result_vector )
            ~ P(label) x P ( result_vector | label ) x P ( lemma_pair | label )
            """
            # if raw_relType == NORELATION or raw_relType == SIMULTANEOUS:
            if raw_relType == NORELATION:
                pass
            else:
                # True when no id in the pair is a time id, i.e. the link is
                # between two events
                def check_event_pair(ids):
                    for id in ids:
                        if id[1] == TID:
                            return False
                    return True
                """
                If the relation is between event pairs, we check the
                narrative scheme, else, just use the raw_relType for TLink
                between time and event.
                """
                # map argument position ('0'/'1') to the eiid or tid
                new_ids = {}
                for id in ids:
                    if id[1] in [TID, EIID]:
                        new_ids[id[0]] = id[2]
                # look the pair up in the gold links, in both directions
                original_relation = None
                if (new_ids['0'], new_ids['1']) in original_ee_tlinks:
                    original_relation = original_ee_tlinks[(new_ids['0'],
                                                           new_ids['1'])][1]
                elif (new_ids['1'], new_ids['0']) in original_ee_tlinks:
                    original_relation = reverse(
                        original_ee_tlinks[(new_ids['1'], new_ids['0'])][1])
                """
                Eleventh try
                Only consider main event pairs inter sentences
                """
                if check_event_pair(ids):
                    probability = {}
                    max_label = None
                    max_prob = None
                    """
                    Third approach: only consider labels inside the votes
                    """
                    result_prob = {}
                    label_prob = {}
                    lemma_pair_prob = {}
                    for label in [str(label[0]) for label in label_vote]:
                        if not label in [BEFORE, AFTER, SIMULTANEOUS]:
                            continue
                        """
                        15th try: only fix BEFORE and AFTER labels
                        """
                        # if not label in [BEFORE, AFTER]:
                        #     continue
                        # for label in [BEFORE, AFTER, SIMULTANEOUS]:
                        probability[label] = 1
                        probability[label] = 1 if False else probability[label]  # NOTE(review): no-op guard removed? keep original line below
                        result_prob[label] = histogram.get_probability_vector(
                            result_dict, label)
                        probability[label] *= result_prob[label]
                        """
                        First approach: only use the morphy lemma
                        """
                        morphy_1 = verb_event_instance[
                            new_ids['0']][MORPHY_LEMMA]
                        morphy_2 = verb_event_instance[
                            new_ids['1']][MORPHY_LEMMA]
                        # lemma_pair_prob[label] = crd.get_lemma_pair_prob((morphy_1,morphy_2,label))
                        # lemma_pair_prob[label] = crd.get_lemma_pair_prob_smoothing((morphy_1,morphy_2),label)
                        """
                        Tenth approach: desperate try, multiply all of them
                        together
                        """
                        # probability[label] *= lemma_pair_prob[label]
                        """
                        Done first approach
                        """
                        """
                        Second approach: use all pairs of lemmas with lemma
                        in corresponding two synsets
                        """
                        lemma_pair_prob[label] = 0
                        synset_1 = verb_event_instance[
                            new_ids['0']][SYNSET_LEMMA]
                        synset_2 = verb_event_instance[
                            new_ids['1']][SYNSET_LEMMA]
                        if synset_1 != None and synset_2 != None:
                            for l_1, l_2 in itertools.product(
                                    synset_1, synset_2):
                                lemma_pair_prob[
                                    label] += crd.get_lemma_pair_prob_smoothing(
                                        (l_1, l_2), label)
                                # lemma_pair_prob[label] += crd.get_lemma_pair_prob((l_1,l_2),label)
                        """
                        Done second approach
                        """
                        """
                        Seventh try: turn off lemma pairs
                        """
                        probability[label] *= lemma_pair_prob[label]
                        label_prob[label] = histogram.get_probability_label(
                            label)
                        # """
                        # 14th try: normalize BEFORE and AFTER labels
                        # """
                        # if label == BEFORE or label == AFTER:
                        #     label_prob[label] = (histogram.get_probability_label (BEFORE)
                        #         + histogram.get_probability_label (AFTER))/2
                        """
                        13 rd try: disable label prob
                        """
                        probability[label] *= label_prob[label]
                        # keep the argmax over the re-scored labels
                        if max_prob == None or max_prob < probability[label]:
                            max_prob = probability[label]
                            max_label = label
                    """
                    Forth try: if max_prob == 0, it means that all
                    probabilities = 0 and we should follow the initialy vote
                    """
                    if max_prob == 0:
                        relType = raw_relType
                    else:
                        relType = max_label
                    # bookkeeping: did the re-scoring help or hurt compared
                    # to the gold relation?
                    need_to_keep_track = False
                    if (relType == raw_relType and original_relation != None
                            and original_relation != relType
                            and original_relation in [BEFORE, AFTER,
                                                      SIMULTANEOUS]):
                        need_to_keep_track = True
                        logging.info(
                            '---------------DOESNT HELP----------------')
                    if relType != raw_relType and original_relation != None:
                        need_to_keep_track = True
                        if (original_relation == relType):
                            fix_label_counter += 1
                        if (original_relation == raw_relType):
                            worsen_label_counter += 1
                        logging.info(
                            '---------------MAKE CHANGE----------------')
                    if need_to_keep_track:
                        logging.info('Correct relation : %s' %
                                     original_relation)
                        logging.info('Original classified : %s' % raw_relType)
                        logging.info('Prior classified : %s' % relType)
                        # NOTE(review): morphy_1/morphy_2/synset_1/synset_2
                        # are only bound if the label loop above executed at
                        # least one BEFORE/AFTER/SIMULTANEOUS iteration — a
                        # NameError is possible here otherwise
                        logging.info(morphy_1)
                        logging.info(morphy_2)
                        logging.info(synset_1)
                        logging.info(synset_2)
                        logging.info('--result_prob--')
                        logging.info(result_prob)
                        logging.info('--label_prob--')
                        logging.info(label_prob)
                        logging.info('--lemma_pair_prob--')
                        logging.info(lemma_pair_prob)
                        logging.info(probability)
                        logging.info('==============================')
                else:
                    # time-event link: keep the raw classifier decision
                    relType = raw_relType
                xml_document.add_tlink(relType, new_ids['0'], new_ids['1'],
                                       SVM_CLASSIFIER_ORIGIN)
    xml_document.save_to_file(tlink_file)
    return (fix_label_counter, worsen_label_counter)
class Blinker (TarsqiComponent):

    """Implements the Blinker component of Tarsqi. Blinker takes the shallow
    tree implemented in the Document object and applies rules that capture
    regularities between events and times as well as between events.

    Instance variables:
       NAME - a string
       rules - a BlinkerRuleDictionary
       rule2_index - a dictionary, quick access to type 2 rules
       dct - a string of the form YYYYMMDD, representing the document
          creation time
       xmldoc - an XmlDocument, created by xml_parser.Parser
       doctree - a Document, created by converter.FragmentConverter"""

    def __init__(self):
        """Set component name and load rules into a BlinkerRuleDictionary
        object, this object knows where the rules are stored."""
        self.NAME = BLINKER
        self.rules = BlinkerRuleDictionary()
        self.rule2_index = {}
        #self.rules.pp_ruletype(2)
        self._populate_rule2_index()

    def _populate_rule2_index(self):
        """Rules of type 2 (timex-signal-event) can be simply put in a hash
        keyed on the signals."""
        for rule in self.rules[2]:
            relation = rule.get_attribute('relation')[0]  # vals are now lists
            signal = rule.get_attribute('signal')[0]
            self.rule2_index[signal] = relation

    def process(self, infile, outfile, dct):
        """Apply all Blinker rules to the input file. Parses the xml file
        with xml_parser.Parser and converts it to a shallow tree with
        converter.FragmentConverter. Then applies the Blinker rules.
        Curently only applies rules of type 2.

        Arguments
           infile - an absolute path
           outfile - an absolute path
           dct - document creation time string (YYYYMMDD)

        No return value."""
        # NOTE(review): xmlfile is never explicitly closed
        xmlfile = open(infile, "r")
        self.dct = dct
        self.xmldoc = Parser().parse_file(xmlfile)
        self.doctree = FragmentConverter(self.xmldoc,
                                         infile).convert(user=BLINKER)
        #self.print_doctree(BLINKER)
        self._run_blinker()
        self.xmldoc.save_to_file(outfile)

    def _run_blinker(self):
        """Apply BLinker rules to the sentences in the doctree variable.
        Currently only deals with rule type 2, anchoring an event to a timex
        in those cases where there is a signal (that is, a preposition)
        available. New Tlinks are added just before the closing tag of the
        fragment."""
        self._run_timex_linking()
        self._apply_event_ordering_with_signal_rules()
        # variables needed for different rule types are prefixed with r<ruleNum>
        r3_event1 = None
        # iterate over sentences
        for si in range(len(self.doctree)):
            sentence = self.doctree[si]
            r3_main_event = None
            if _DEBUG5:
                print "processing sentence", si
            # iterate over elements within a sentence
            for i in range(len(sentence)):
                element = sentence[i]
                timex = element.get_timex()
                event = element.get_event()
                # RULE TYPE 2
                if timex:
                    # chunk contains a timex, now try to anchor events to it
                    self._apply_event_anchoring_rules(sentence, timex, i)
                # RULE TYPE 3
                if event and element.isChunk() and element.isVerbChunk():
                    # the first verb event in a sentence is considered the
                    # main event
                    if not r3_main_event:
                        r3_main_event = event
                        # if previous sentence contained an event, create a link
                        if r3_event1:
                            r3_event2 = r3_main_event
                            self._apply_type3_rules(r3_event1, r3_event2)
                            r3_event1 = r3_event2
                        # else set event1
                        else:
                            r3_event1 = r3_main_event
                #"""
                # RULE TYPE 5
                if event and element.isChunk() \
                        and element.isVerbChunk() \
                        and event.attrs['class'] == 'REPORTING':
                    if _DEBUG5:
                        print "applying type 5 rules"
                    self._apply_type5_rules(sentence, event, i)
                #"""
            # R3: if no main event in sentence
            if not r3_main_event:
                r3_event1 = None

    def _run_timex_linking(self):
        """Apply the rules that govern relations between TIMEX3 tags. Only
        applies to TIMEX3 tags with a VAL attribute equal to DATE."""
        timexes = [timex for timex in self.xmldoc.get_tags(TIMEX)
                   if timex.attrs['TYPE'] == 'DATE']
        for t in timexes:
            if t.attrs.get('VAL', None) is None:
                logger.warn("Missing VAL: %s" % t.get_content())
        # try to link every unordered pair of timexes exactly once (i < j)
        for i in range(len(timexes)):
            for j in range(len(timexes)):
                if i < j:
                    try:
                        self._create_timex_link(timexes[i], timexes[j])
                    except Exception:
                        logger.error("Error in Timex Linking:\n%s\n%s" % \
                                     (timexes[i].get_content(),
                                      timexes[j].get_content()))

    def _create_timex_link(self, timex1, timex2):
        """Try to create a TLINK between two timexes."""
        # compare_date needs the creation year to resolve partial dates
        creation_year = self.dct[0:4]
        date1 = timex1.attrs.get('VAL', None)
        date2 = timex2.attrs.get('VAL', None)
        if date1 is None or date2 is None:
            return
        date1 = fix_timex_val(date1)
        date2 = fix_timex_val(date2)
        tid1 = timex1.attrs['tid']
        tid2 = timex2.attrs['tid']
        comment = "Blinker - Timex Linking"
        if date1 == date2:
            # identical vague references (PAST_REF/FUTURE_REF) are not
            # necessarily identical times, so skip those
            if date1 not in ('PAST_REF', 'FUTURE_REF'):
                self.xmldoc.add_tlink('IDENTITY', tid1, tid2, comment)
        else:
            rel = compare_date(date1, date2, creation_year)
            if rel != 'IDENTITY':
                self.xmldoc.add_tlink(rel, tid1, tid2, comment)

    def _apply_type3_rules(self, event1, event2):
        """ Creates a TLINK between two main events """
        if _DEBUG3:
            print event1.dtrs[0].getText(), event2.dtrs[0].getText()
            print event1.dtrs[0].getText(), event1.attrs['class'], \
                event1.attrs['tense'], event1.attrs['aspect']
            print event2.dtrs[0].getText(), event2.attrs['class'], \
                event2.attrs['tense'], event2.attrs['aspect']
        for i in range(len(self.rules[3])):
            rule = self.rules[3][i]
            if _DEBUG3:
                print "RULE %s:" % (rule.rule_number)
                print rule.attrs['arg1.class'], rule.attrs['arg1.tense'], rule.attrs['arg1.aspect']
                print rule.attrs['arg2.class'], rule.attrs['arg2.tense'], rule.attrs['arg2.aspect']
            # see tags.py and library.timeMLspec.py for attribute names
            if event1.attrs['class'] in rule.attrs['arg1.class'] and \
                    event2.attrs['class'] in rule.attrs['arg2.class'] and \
                    event1.attrs['tense'] in rule.attrs['arg1.tense'] and \
                    event2.attrs['tense'] in rule.attrs['arg2.tense'] and \
                    event1.attrs['aspect'] in rule.attrs['arg1.aspect'] and \
                    event2.attrs['aspect'] in rule.attrs['arg2.aspect']:
                rel = rule.attrs['relation'][0]
                self.xmldoc.add_tlink(
                    rel, event1.attrs[EIID], event2.attrs[EIID],
                    "Blinker - Type 3 (rule %s)" % rule.rule_number)
                if _DEBUG3:
                    print "RULE %s fired!" % rule.rule_number
                # apply the first matching rule
                return

    def _apply_type5_rules(self, sentence, event1, position):
        """ Creates TLINKs between the reporting event and reported events
        Takes as arguments sentence, reporting event constituent, and
        position of that constituent within the sentence list"""
        # filter out rules with wrong tense
        applicable_rules = self.rules[5][:]
        applicable_rules = [rule for rule in applicable_rules
                            if event1.attrs['tense'] in rule.attrs['arg1.tense']]
        # reset to opposite when quote is encountered
        direct = 'INDIRECT'
        # forward
        if _DEBUG5:
            print "inside rule application function"
            sentence.pretty_print()
        for i in range(position+1, len(sentence)):
            if _DEBUG5:
                print "processing element", i
            element = sentence[i]
            # quote
            if element.isToken() and element.getText() in QUOTES:
                # NOTE(review): the second `if` immediately undoes the first
                # (DIRECT -> INDIRECT -> DIRECT), so `direct` always ends up
                # DIRECT here; this looks like it was meant to be an elif
                if direct == 'DIRECT':
                    direct = 'INDIRECT'
                if direct == 'INDIRECT':
                    direct = 'DIRECT'
            # event
            event2 = element.get_event()
            if event2 and element.isChunk() and element.isVerbChunk():
                current_rules = applicable_rules[:]
                current_rules = [rule for rule in current_rules
                                 if direct in rule.attrs['sentType']]
                if _DEBUG5:
                    print event1.dtrs[0].getText(), event2.dtrs[0].getText()
                    print event1.dtrs[0].getText(), event1.attrs['class'], \
                        event1.attrs['tense'], event1.attrs['aspect']
                    print event2.dtrs[0].getText(), event2.attrs['class'], \
                        event2.attrs['tense'], event2.attrs['aspect']
                for rule in current_rules:
                    # if attribute not set in the rule, accept any value
                    # NOTE(review): this mutates the shared rule objects
                    for att in ['class', 'tense', 'aspect']:
                        if not rule.attrs.has_key('arg2.'+att):
                            rule.attrs['arg2.'+att] = [event2.attrs[att]]
                    if _DEBUG5:
                        print "RULE %s (%s):" % (rule.rule_number,
                                                 rule.attrs['sentType'][0])
                        print rule.attrs['arg1.class'], rule.attrs['arg1.tense'], \
                            rule.attrs['arg1.aspect']
                        print rule.attrs['arg2.class'], rule.attrs['arg2.tense'], \
                            rule.attrs['arg2.aspect']
                    # check that specified values match
                    if event2.attrs['class'] in rule.attrs['arg2.class'] and \
                            event2.attrs['tense'] in rule.attrs['arg2.tense'] and \
                            event2.attrs['aspect'] in rule.attrs['arg2.aspect']:
                        rel = rule.attrs['relation'][0]
                        self.xmldoc.add_tlink(
                            rel, event1.attrs['eiid'], event2.attrs['eiid'],
                            "Blinker - Type 5 (rule %s)" % rule.rule_number)
                        if _DEBUG5:
                            print "RULE %s fired!" % rule.rule_number
                        # apply the first matching rule
                        return
        # backward
        # - this creates multiple links for REPORTING to REPORTING
        # - may add the appropriate rules to the rule file instead
        direct = 'INDIRECT'
        for i in range(position-1, -1, -1):  # ..,3,2,1,0
            if _DEBUG5:
                print "processing element", i
            element = sentence[i]
            # quote
            if element.isToken() and element.getText() in QUOTES:
                # NOTE(review): same always-DIRECT toggle issue as above
                if direct == 'DIRECT':
                    direct = 'INDIRECT'
                if direct == 'INDIRECT':
                    direct = 'DIRECT'
            # event
            event2 = element.get_event()
            if event2 and element.isChunk() and element.isVerbChunk():
                current_rules = applicable_rules[:]
                current_rules = [rule for rule in current_rules
                                 if direct in rule.attrs['sentType']]
                if _DEBUG5:
                    print event1.dtrs[0].getText(), event2.dtrs[0].getText()
                    print event1.dtrs[0].getText(), event1.attrs['class'], \
                        event1.attrs['tense'], event1.attrs['aspect']
                    print event2.dtrs[0].getText(), event2.attrs['class'], \
                        event2.attrs['tense'], event2.attrs['aspect']
                    print "Applying rules for sentence type:", direct, \
                        len(current_rules), "rules"
                for rule in current_rules:
                    # if attribute not set in the rule, accept any value
                    for att in ['class', 'tense', 'aspect']:
                        if not rule.attrs.has_key('arg2.'+att):
                            rule.attrs['arg2.'+att] = [event2.attrs[att]]
                    if _DEBUG5:
                        print "RULE %s (%s):" % (rule.rule_number,
                                                 rule.attrs['sentType'][0])
                        print rule.attrs['arg1.class'], rule.attrs['arg1.tense'], \
                            rule.attrs['arg1.aspect']
                        print rule.attrs['arg2.class'], rule.attrs['arg2.tense'], \
                            rule.attrs['arg2.aspect']
                    # check that specified values match
                    if event2.attrs['class'] in rule.attrs['arg2.class'] and \
                            event2.attrs['tense'] in rule.attrs['arg2.tense'] and \
                            event2.attrs['aspect'] in rule.attrs['arg2.aspect']:
                        rel = rule.attrs['relation'][0]
                        self.xmldoc.add_tlink(
                            rel, event1.attrs['eiid'], event2.attrs['eiid'],
                            "Blinker - Type 5 (rule %s)" % rule.rule_number)
                        if _DEBUG5:
                            print "RULE %s fired!" % rule.rule_number
                        # apply the first matching rule
                        return

    def _apply_event_anchoring_rules(self, sentence, timex, i):
        """Anchor events to a given timex that occurs in the sentence at
        index i. The method proceeds by looking for some simple syntactic
        patterns with and without prepositions. If a pattern with a
        preposition occurs, then the preposition is looked up in
        self.rule2_index. If no signal is found, then the default INCLUDES
        rule will apply (rule 1), this is not yet implemented."""
        # NOTES:
        # - Need to add some kind of confidence measures

        # PATTERN: [TIMEX EVENT]
        # Or, more precisely, an event in the same chunk as the timex
        # Example: "October elections"
        event = sentence[i].get_event()
        if event:
            eiid = event.attrs[EIID]
            tid = timex.attrs[TID]
            self.xmldoc.add_tlink('IS_INCLUDED', eiid, tid, "Blinker - Type 1")
            return
        # Pattern: [CHUNK-WITH-EVENT] Prep [CHUNK-WITH-TIMEX]
        if i > 1:
            event = sentence[i-2].get_event()
            if sentence[i-1].isPreposition() and event:
                signal = sentence[i-1].getText().lower()
                # NOTE(review): rel is None for signals not in the index and
                # is passed to add_tlink as-is
                rel = self.rule2_index.get(signal)
                eiid = event.attrs[EIID]
                tid = timex.attrs[TID]
                if _DEBUG2:
                    print "FOUND: [%s] %s [%s] --> %s" % \
                        (event.dtrs[0].getText(), signal, timex.getText(), rel)
                self.xmldoc.add_tlink(rel, eiid, tid,
                                      "Blinker - Type 2 (%s)" % signal)
                return
        # Pattern: [CHUNK-WITH-VERBAL-EVENT] [CHUNK-WITH_TIMEX]
        if i > 0:
            previous_chunk = sentence[i-1]
            if previous_chunk.isVerbChunk():
                event = previous_chunk.get_event()
                if event:
                    #if event.attrs[POL] != 'NEG':
                    eiid = event.attrs[EIID]
                    tid = timex.attrs[TID]
                    self.xmldoc.add_tlink('IS_INCLUDED', eiid, tid,
                                          "Blinker - Type 1a")
                    return

    def _apply_event_ordering_with_signal_rules(self):
        """Some more rules without using any rules, basically a placeholder
        for event ordering rules that use a signal."""
        signal_mapping = {
            'after': 'AFTER',
            'before': 'BEFORE',
            'during': 'DURING'
        }
        for si in range(len(self.doctree)):
            sentence = self.doctree[si]
            for i in range(len(sentence)):
                try:
                    #print sentence[i:i+4]
                    # a 4-element window; the unpack raises near the end of
                    # the sentence, which the bare except silences
                    (VG1, Prep, NG, VG2) = sentence[i:i+4]
                    event1 = VG1.get_event()
                    event2 = VG2.get_event()
                    # Pattern: [VG +Event] [Prep] [NG -Event] [VG +Event]
                    if event1 and VG1.isVerbChunk() and \
                            Prep.isPreposition() and \
                            NG.isNounChunk() and not NG.get_event() and \
                            event2 and VG2.isVerbChunk():
                        #print "[VG +Event] [Prep] [NG -Event] [VG +Event]"
                        #print Prep
                        prep_token = Prep.getText().lower()
                        #print prep_token
                        rel = signal_mapping.get(prep_token)
                        #print rel
                        if rel:
                            #print 'adding tlink'
                            eiid1 = event1.attrs[EIID]
                            eiid2 = event2.attrs[EIID]
                            self.xmldoc.add_tlink(
                                rel, eiid1, eiid2,
                                "Blinker - Event:Signal:Event")
                # NOTE(review): bare except hides real errors, not just the
                # expected unpack failure at the sentence boundary
                except:
                    pass
class HtmlGenerator:

    """An HtmlGenerator is created for an XML file with TimeML tags. It is
    used to create HTML files that display the text with events and times
    highlighted and links lined up with the sentences. It can also be used to
    create tables for events, times and links.

    Instance variables:
       file_name - an absolute path
       xmldoc - an XmlDocument
       sentences - a list
       instances - a mapping from eiid and eid to XmlDocElements that contain
          an instance
       links - a mapping from eids and tids to links that contain them"""

    def __init__(self, file_name):
        """Parse file_name and initialize all sentence, event, timex and
        link indexes."""
        self.file_name = file_name
        # close the handle after parsing instead of leaking it
        infile = open(file_name, 'r')
        try:
            self.xmldoc = Parser().parse_file(infile)
        finally:
            infile.close()
        self.sentences = []
        self.instance_idx = {}
        self.event_idx = {}
        self.timex_idx = {}
        self.events = []
        self.timexes = []
        self.links = {}
        self._init_sentence_list()
        self._init_indexes()
        self._init_events_list()
        self._init_timexes_list()
        self._init_link_index()

    def _init_sentence_list(self):
        """Fill in the self.sentences list. Sentences do not include their
        opening and closing tags."""
        for open_s_tag in self.xmldoc.get_tags('s'):
            close_s_tag = open_s_tag.get_closing_tag()
            elements = open_s_tag.get_slice_till(close_s_tag.id)
            self.sentences.append(Sentence(elements))

    def _init_events_list(self):
        """Fill self.events with Event objects, pairing each EVENT tag with
        its instance. Requires _init_indexes to have run first."""
        for event in self.xmldoc.get_tags('EVENT'):
            eid = event.attrs['eid']
            instance = self.instance_idx[eid]
            self.events.append(Event(event, instance))

    def _init_timexes_list(self):
        """Fill self.timexes with all TIMEX3 tags."""
        self.timexes = self.xmldoc.get_tags('TIMEX3')

    def _init_indexes(self):
        """The value for these indixes are always XML elements."""
        for instance in self.xmldoc.get_tags('MAKEINSTANCE'):
            eiid = instance.attrs.get('eiid')
            eid = instance.attrs.get('eventID')
            # index each instance under both its eiid and its eid
            self.instance_idx[eiid] = instance
            self.instance_idx[eid] = instance
        for event in self.xmldoc.get_tags('EVENT'):
            eid = event.attrs.get('eid')
            self.event_idx[eid] = event
        for timex in self.xmldoc.get_tags('TIMEX3'):
            tid = timex.attrs.get('tid')
            self.timex_idx[tid] = timex
        # need to add dct, but this needs to be done differently
        # NOTE(review): the document creation time is hard-coded here
        self.timex_idx['t0'] = create_dct_element('20080515')

    def _init_link_index(self):
        """Index all ALINK, SLINK and TLINK elements on the id of their
        first argument (eventInstanceID or timeID)."""
        link_elements = \
            self.xmldoc.get_tags('ALINK') + \
            self.xmldoc.get_tags('SLINK') + \
            self.xmldoc.get_tags('TLINK')
        for link_element in link_elements:
            link = Link(link_element, self.instance_idx)
            link_id = link.get_id(['eventInstanceID', 'timeID'])
            # group links that share the same source id
            self.links.setdefault(link_id, []).append(link)

    def create_file(self, outfile, creators):
        """Creates a file with all sentences and the links lined up with the
        sentences. Only links whose creator is in creators are printed."""
        fh = open(outfile, 'w')
        try:
            fh.write(
                "<html>\n<head><style>\n" +
                "s {display: block; text-decoration: none}\n" +
                # fixed: close the head element, and close the table tag
                # (original emitted "<table cellpadding=4" without '>')
                "</style>\n</head>\n<body>\n" +
                "<table cellpadding=4>" +
                "<tr>\n <td>source\n <td>%s\n" % self.file_name +
                "<tr>\n <td>components\n <td>%s\n" % ' + '.join(creators) +
                "</table>\n\n" +
                "<hr>\n\n" +
                "<table cellspacing=7pt>\n")
            for sentence in self.sentences:
                fh.write("\n<tr>\n <td>")
                sentence.print_html(fh)
                fh.write(" <td>")
                for link_id in sentence.get_ids():
                    for link in self.links.get(link_id, []):
                        if link.creator in creators:
                            fh.write(' ' + link.convert() + "<br/>\n")
            fh.write("\n</table>\n</body>\n</html>\n")
        finally:
            fh.close()

    def create_events_table(self, file):
        """Write an HTML table to file listing each event with its eid, pos,
        class, tense, aspect, polarity and modality."""
        fh = open(file, 'w')
        try:
            fh.write("<html>\n<body>\n<table cellpadding=4>\n" +
                     "<tr align=\"left\">\n" +
                     " <th bgcolor=\"#dddddd\">event\n" +
                     " <th bgcolor=\"#dddddd\">pos\n" +
                     " <th bgcolor=\"#dddddd\">class\n" +
                     " <th bgcolor=\"#dddddd\">tense\n" +
                     " <th bgcolor=\"#dddddd\">aspect\n" +
                     " <th bgcolor=\"#dddddd\">polarity\n" +
                     " <th bgcolor=\"#dddddd\">modality\n" +
                     "</tr>\n")
            celltag = '<td bgcolor="#dddddd">'
            for event in self.events:
                fh.write("<tr>\n")
                eid = event.attrs['eid']
                text = event.attrs['text']
                pos = event.attrs['pos'].lower()
                eclass = event.attrs['class'].lower()
                tense = event.attrs['tense'].lower()
                aspect = event.attrs['aspect'].lower()
                polarity = event.attrs['polarity'].lower()
                modality = event.attrs.get('modality', '').lower()
                # blank out default values so only interesting ones show
                if tense == 'none':
                    tense = ''
                if aspect == 'none':
                    aspect = ''
                if polarity == 'pos':
                    polarity = ''
                if aspect == 'perfective_progressive':
                    aspect = 'perf_prog'
                fh.write(" %s<font color=red>%s_%s</font>\n"
                         % (celltag, text, eid))
                fh.write(" %s%s\n" % (celltag, pos))
                fh.write(" %s%s\n" % (celltag, eclass))
                fh.write(" %s%s\n" % (celltag, tense))
                fh.write(" %s%s\n" % (celltag, aspect))
                fh.write(" %s%s\n" % (celltag, polarity))
                fh.write(" %s%s\n" % (celltag, modality))
            # fixed: the original wrote "<html>" instead of "</html>"
            fh.write("</table>\n</body>\n</html>\n")
        finally:
            fh.close()

    def create_timexes_table(self, file):
        """Write an HTML table to file listing each timex with its tid, type
        and value."""
        fh = open(file, 'w')
        try:
            fh.write("<html>\n<body>\n<table cellpadding=4>\n" +
                     "<tr align=\"left\">\n" +
                     " <th bgcolor=\"#dddddd\">timex\n" +
                     " <th bgcolor=\"#dddddd\">type\n" +
                     " <th bgcolor=\"#dddddd\">value\n" +
                     "</tr>\n")
            celltag = '<td bgcolor="#dddddd">'
            for timex in self.timexes:
                fh.write("<tr>\n")
                tid = timex.attrs['tid']
                text = timex.collect_content()
                # renamed local: 'type' shadowed the builtin
                timex_type = timex.attrs['TYPE'].lower()
                value = timex.attrs.get('VAL')
                fh.write(" %s<font color=blue>%s_%s</font>\n"
                         % (celltag, text, tid))
                fh.write(" %s%s\n" % (celltag, timex_type))
                fh.write(" %s%s\n" % (celltag, str(value)))
            # fixed: the original wrote "<html>" instead of "</html>"
            fh.write("</table>\n</body>\n</html>\n")
        finally:
            fh.close()

    def create_links_table(self, file):
        """Write an HTML table to file with one row per link: both arguments
        with their text, the relType and the link's origin."""
        fh = open(file, 'w')
        try:
            fh.write("<html>\n<body>\n<table cellpadding=4>\n")
            celltag = '<td bgcolor="#dddddd">'
            for sentence in self.sentences:
                for link_id in sentence.get_ids():
                    for link in self.links.get(link_id, []):
                        id1 = link.get_id1()
                        id2 = link.get_id2()
                        text1 = self._get_text(id1)
                        text2 = self._get_text(id2)
                        full_text1 = color_text(text1, id1)
                        full_text2 = color_text(text2, id2)
                        # some links carry a syntax attribute instead of
                        # an origin
                        origin = link.attrs.get('origin')
                        if not origin:
                            origin = link.attrs.get('syntax')
                        fh.write("<tr>\n")
                        fh.write(" %s%s\n" % (celltag, full_text1))
                        fh.write(" %s%s\n" % (celltag, link.attrs['relType']))
                        fh.write(" %s%s\n" % (celltag, full_text2))
                        fh.write(" %s%s\n" % (celltag, origin))
                        fh.write("</tr>\n")
            # fixed: the original wrote "<html>" instead of "</html>"
            fh.write("</table>\n</body>\n</html>\n")
        finally:
            fh.close()

    def _get_text(self, id):
        """Return display text for an event instance id or timex id."""
        # this is a bit of a hack and needs to be solved elsewhere
        # the DCT has no tag and the code to collect data crashes
        if id == 't0':
            return 'DCT'
        if id.startswith('t'):
            timex = self.timex_idx[id]
            return timex.collect_content()
        if id.startswith('e'):
            instance = self.instance_idx[id]
            eid = instance.attrs['eventID']
            event = self.event_idx[eid]
            return event.collect_content()
def check_abnormal_single(tlink_file):
    """Check the TLINKs in tlink_file for logical consistency.

    For every pair of events that both have a TLINK to the document
    creation time (t0), the event-event relation is checked against the
    composition of the two event-dct relations (LOGIC_COMPOSITION).
    Returns a (wrong_number, total_number) tuple.

    Note: this does not guarantee that every event linked to the dct is
    the main event of a sentence, but that holds for classified tlinks."""

    xml_document = Parser().parse_file(open(tlink_file, "r"))
    # (eiid, related eiid, reltype) triples for event-event TLINKs
    ee_tlinks = []
    # eiid -> relation with the dct, normalized so that t0 is always the
    # main entity of the relation
    main_events = {}

    for element in xml_document.get_tags(TLINK):
        # keep track of event order here
        if not element.is_opening_tag():
            continue
        attrs = element.attrs
        if EVENT_INSTANCE_ID in attrs:
            eid = attrs[EVENT_INSTANCE_ID]
            if RELATED_TO_EVENT_INSTANCE in attrs and RELTYPE in attrs:
                ee_tlinks.append(
                    (eid, attrs[RELATED_TO_EVENT_INSTANCE], attrs[RELTYPE]))
            if RELATED_TO_TIME in attrs and RELTYPE in attrs \
               and attrs[RELATED_TO_TIME] == 't0':
                # flip the relation so the time id is the main entity
                main_events[eid] = reverse(attrs[RELTYPE])
        if TIME_ID in attrs:
            if RELATED_TO_EVENT_INSTANCE in attrs and RELTYPE in attrs \
               and attrs[TIME_ID] == 't0':
                main_events[attrs[RELATED_TO_EVENT_INSTANCE]] = attrs[RELTYPE]

    total_number = 0
    wrong_number = 0
    for eiid1, eiid2, ee_rel in ee_tlinks:
        if eiid1 not in main_events or eiid2 not in main_events:
            continue
        total_number += 1
        # Both arguments are main events. Relations between main events
        # and between a main event and the dct are AFTER, BEFORE and
        # SIMULTANEOUS.
        logging.info("=========================================")
        logging.info(eiid1)
        logging.info(eiid2)
        # event 1 is main, related to the dct time
        relation_1 = reverse(main_events[eiid1])
        logging.info(relation_1)
        # the dct time is main, related to event 2
        relation_2 = main_events[eiid2]
        logging.info(relation_2)
        # event 1 is main, related to event 2
        relation_3 = ee_rel
        logging.info(relation_3)
        if relation_3 in LOGIC_COMPOSITION[(relation_1, relation_2)]:
            logging.info("Satisfy constraint")
        else:
            wrong_number += 1
            logging.warn("DOESN'T SATISFY")

    logging.info("Wrong number %d/ Total %d" % (wrong_number, total_number))
    return (wrong_number, total_number)
def compare_performance_single(tlink_file, original_file):
    # Compare a classified file against the gold original file and the
    # narrative-scheme database; logs per-pair details and summary counts.
    logging.info('================Compare======================')
    logging.info('Classified file: %s' % tlink_file)
    logging.info('Original file: %s' % original_file)
    """
    Compare the performance of a classified file (that is the result of
    any algorithm or method), toward a destination gold file and the event
    pair temporal ordering provided by narrative scheme database.

    Only compare the performance between classified files, original files,
    and temporal ordering database with TLINKs that is:
        - TLINKS between two events.
        - Two events need to appear in both original file and classified file
        - Two events need to be found in the narrative scheme.
    """
    xmldoc_classified = Parser().parse_file(open(tlink_file, "r"))
    xmldoc_original = Parser().parse_file(open(original_file, "r"))
    """
    Verb event should be a dictionary to map between an event id and some
    other lemmas generated from the initial lemma.
    """
    verb_events = {}
    """
    EVENT tag sample
    <EVENT class="OCCURRENCE" eid="e1000028">
    """
    for element in xmldoc_classified.get_tags(EVENT):
        if element.is_opening_tag():
            eid = element.attrs[EID]
            # the text node following the opening EVENT tag
            event_content = element.next.content
            synsets_event = None
            # NOTE(review): lemma_names as an attribute is the NLTK 2 API;
            # in NLTK 3 it is a method -- confirm the NLTK version used
            if len(wn.synsets(event_content, 'v')) > 0:
                synsets_event = wn.synsets(element.next.content,
                                           'v')[0].lemma_names
            verb_morphy = wn.morphy(event_content, 'v')
            verb_events[eid] = {MORPHY_LEMMA: verb_morphy,
                                SYNSET_LEMMA: synsets_event}
    """
    <MAKEINSTANCE eventID="e2" polarity="POS" pos="VERB" eiid="ei2"
    tense="PRESENT" aspect="PERFECTIVE">
    """
    # eiid -> lemma dict of the underlying event
    verb_event_instance = {}
    for element in xmldoc_classified.get_tags(INSTANCE):
        if element.is_opening_tag():
            eiid = element.attrs[EIID]
            eid = element.attrs[EVENTID]
            if eid in verb_events:
                verb_event_instance[eiid] = verb_events[eid]
    """
    All TLINKS in the document that are of two events
    """
    ee_tlinks = []
    for element in xmldoc_classified.get_tags(TLINK):
        # keep track of event order here
        if element.is_opening_tag():
            lid = element.attrs[LID]
            if EVENT_INSTANCE_ID in element.attrs:
                eiid = element.attrs[EVENT_INSTANCE_ID]
                if RELATED_TO_EVENT_INSTANCE in element.attrs:
                    reiid = element.attrs[RELATED_TO_EVENT_INSTANCE]
                    if RELTYPE in element.attrs:
                        # only keep pairs where both events are known verbs
                        if eiid in verb_event_instance and \
                           reiid in verb_event_instance:
                            ee_tlinks.append(
                                (eiid, reiid, (lid, element.attrs[RELTYPE])))
    """
    All TLINKs in the original document between two events
    Because excepts the TLINKs parts, original and classified documents
    should be identical, so they could use the same verb_event_instance.
    """
    original_ee_tlinks = {}
    for element in xmldoc_original.get_tags(TLINK):
        # keep track of event order here
        if element.is_opening_tag():
            lid = element.attrs[LID]
            if EVENT_INSTANCE_ID in element.attrs:
                eiid = element.attrs[EVENT_INSTANCE_ID]
                if RELATED_TO_EVENT_INSTANCE in element.attrs:
                    reiid = element.attrs[RELATED_TO_EVENT_INSTANCE]
                    if RELTYPE in element.attrs:
                        if eiid in verb_event_instance and \
                           reiid in verb_event_instance:
                            original_ee_tlinks[(eiid, reiid)] = (
                                lid, element.attrs[RELTYPE])
    no_of_pair_in_database = 0
    no_of_pair = 0
    """
    No of pairs of events that are found in classified files, original
    files and narrative scheme.
    """
    no_of_compare_pairs = 0
    """
    No of pairs of events that are found in classified files, original
    files and narrative scheme and that doesn't have matching relation
    type between classified and original files
    """
    no_of_incompatible_pairs = 0
    # NOTE(review): no_of_verb_events is never used below
    no_of_verb_events = len(verb_events)
    for eiid, reiid, tlink in ee_tlinks:
        no_of_pair += 1
        """
        Here accessing verb_event_instance given the event instance id
        would give the result to be a dictionary of lemmas.
        """
        lemma_dict_1 = verb_event_instance[eiid]
        lemma_dict_2 = verb_event_instance[reiid]
        logging.info(
            "---------------------------------------------------------------")
        logging.info("---------Classified file---------")
        logging.info("Tlink id in classified file is %s" % tlink[0])
        relType = tlink[1]
        logging.info("Label in classified file is %s" % relType)
        if (eiid, reiid) in original_ee_tlinks:
            no_of_compare_pairs += 1
            original_relation = original_ee_tlinks[(eiid, reiid)][1]
            logging.info("---------Original file---------")
            # NOTE(review): these two messages say "classified file" but
            # report values from the original file -- looks copy-pasted
            logging.info("Tlink id in classified file is %s"
                         % original_ee_tlinks[(eiid, reiid)][0])
            logging.info("Label in classified file is %s" % original_relation)
            if relType != original_relation:
                no_of_incompatible_pairs += 1
        elif (reiid, eiid) in original_ee_tlinks:
            no_of_compare_pairs += 1
            # reversed pair order, so reverse the gold relation to compare
            original_relation = reverse(original_ee_tlinks[(reiid, eiid)][1])
            logging.info("---------Original file---------")
            logging.info("In the original file, the TLINK is %s"
                         % original_relation)
            logging.info("Tlink in original file %s"
                         % original_ee_tlinks[(reiid, eiid)][0])
            if relType != original_relation:
                no_of_incompatible_pairs += 1
        else:
            # pair not present in the original file in either order
            continue
        logging.info("---------LEMMA---------")
        if lemma_dict_1[MORPHY_LEMMA] != None and \
           lemma_dict_2[MORPHY_LEMMA] != None:
            v_1 = lemma_dict_1[MORPHY_LEMMA]
            v_2 = lemma_dict_2[MORPHY_LEMMA]
            # NOTE(review): this continue also skips the SYNSET block below
            if v_1 == v_2:
                continue
            try:
                result = narrative_checker.check_in_dict(v_1, v_2)
                if result != None:
                    logging.info("%s, %s, %d, %d"
                                 % (v_1, v_2, result[0], result[1]))
            except Exception as e:
                logging.error(str(e))
        logging.info("---------SYNSET---------")
        # accumulated (count_1, count_2) over all synset lemma pairs
        sum_result = [0, 0]
        if lemma_dict_1[SYNSET_LEMMA] != None and \
           lemma_dict_2[SYNSET_LEMMA] != None:
            v_1_list = lemma_dict_1[SYNSET_LEMMA]
            v_2_list = lemma_dict_2[SYNSET_LEMMA]
            for v_1, v_2 in itertools.product(v_1_list, v_2_list):
                if v_1 == v_2:
                    continue
                try:
                    result = narrative_checker.check_in_dict(v_1, v_2)
                    if result != None:
                        sum_result[0] += result[0]
                        sum_result[1] += result[1]
                except Exception as e:
                    logging.error(str(e))
            logging.info("%s\n %s\n %d, %d"
                         % (str(v_1_list), str(v_2_list),
                            sum_result[0], sum_result[1]))
            no_of_pair_in_database += 1
    logging.info('Number of events tlink: %d' % no_of_pair)
    logging.info('Number of pairs in database : %d' % no_of_pair_in_database)
    # NOTE(review): raises ZeroDivisionError when no_of_pair is 0
    logging.info('Percentage of pairs that are found \
in narrative scheme database: %.2f'
                 % (float(no_of_pair_in_database) / no_of_pair))
    logging.info('Number of pairs that are found in database\
 and original files as well %d' % no_of_compare_pairs)
    logging.info('Number of pairs that are found in database\
 and original files and not compatible %d' % no_of_incompatible_pairs)
    logging.info(
        '============================================================')