def _add_tlinks_to_fragment(self, in_fragment, tmp_fragment, out_fragment):
    """Take the links from the merged tlinks and add them into the
    fragment. Based on the method with the same name in the classifier
    wrapper.

    Arguments:
       in_fragment - path of the fragment whose TLINKs get replaced
       tmp_fragment - path of the file holding the merged TLINKs
       out_fragment - path the result is written to"""
    # use context managers so the file handles are not leaked
    with open(in_fragment, 'r') as f1:
        xmldoc1 = Parser().parse_file(f1)
    with open(tmp_fragment, 'r') as f2:
        xmldoc2 = Parser().parse_file(f2)
    # the old links are superseded by the merged ones
    xmldoc1.remove_tags(TLINK)
    for tlink in xmldoc2.get_tags(TLINK):
        reltype = tlink.attrs[RELTYPE]
        # each end point of a TLINK is either an event instance or a timex
        id1 = tlink.attrs.get(EVENT_INSTANCE_ID) or tlink.attrs.get(TIME_ID)
        if not id1:
            logger.warn("Could not find id1 in " + tlink.content)
        id2 = tlink.attrs.get(RELATED_TO_EVENT_INSTANCE) \
              or tlink.attrs.get(RELATED_TO_TIME)
        if not id2:
            logger.warn("Could not find id2 in " + tlink.content)
        # propagate the origin recorded on the merged link (may be empty)
        origin = tlink.attrs.get('origin', '')
        xmldoc1.add_tlink(reltype, id1, id2, origin)
    xmldoc1.save_to_file(out_fragment)
def _create_vectors(self, in_fragment, ee_fragment, et_fragment, fragment): """New method that takes over the functionality of the old Perl script named prepareClassifier. UNDER CONSTRUCTION""" #print in_fragment ee_file = open(ee_fragment, 'w') et_file = open(et_fragment, 'w') #print fragment frag = Parser().parse_file(open(in_fragment, 'r')) # collect objects from the fragment events = frag.tags['EVENT'] instances = frag.tags['MAKEINSTANCE'] times = frag.tags['TIMEX3'] # add instance information to events eid2inst = {} for inst in instances: eid = inst.attrs.get('eventID', None) eid2inst[eid] = inst for event in events: eid = event.attrs.get('eid', None) inst = eid2inst[eid] for (key, val) in inst.attrs.items(): event.attrs[key] = val #event.attrs['instance'] = eid2inst[eid] #print event.attrs objects = times + events objects.sort(lambda a, b: cmp(a.id, b.id))
def merge_tags(infile1, infile2, merged_file):
    """Merge the tags from infile1, which has all tags from the input,
    with tags from infile2, which has only s, lex and TIMEX3 tags. The
    lex tags are used as the pivots and it is assumed that both files
    contain the same amount of lex tags.

    Arguments:
       infile1 - filename of the document with all input tags
       infile2 - filename of the document with s, lex and TIMEX3 tags
       merged_file - filename the merged document is written to"""
    # create the document objects and add lex_id values to the lex tags;
    # context managers prevent leaking the input file handles
    with open(infile1, "r") as f1:
        doc1 = Parser().parse_file(f1)
    with open(infile2, "r") as f2:
        doc2 = Parser().parse_file(f2)
    _mark_lex_tags(doc1)
    _mark_lex_tags(doc2)
    # get the timexes and embedded lex tags from infile2, and create an
    # index of the lex tags of infile1 using lex_id
    extended_timexes = _get_timextags_with_contained_lextags(doc2)
    lexid_to_lextag = _create_lexid_index(doc1)
    for extended_timex in extended_timexes:
        # get first and last document element of infile1
        timex_tag = extended_timex[0]
        first_lex = extended_timex[1][0]
        last_lex = extended_timex[1][-1]
        first_element = lexid_to_lextag[first_lex]
        last_element = lexid_to_lextag[last_lex].get_closing_tag()
        # get the entire sequence that is to be embedded in the timex
        # tag; join once instead of quadratic string concatenation
        sequence = first_element.get_slice_till(last_element.id)
        sequence_string = ''.join([el.content for el in sequence])
        # check whether this sequence, when embedded in a tag, results
        # in well-formed XML, if so, add the new timex tag to infile1,
        # otherwise, ignore and print warning
        try:
            Parser().parse_string("<TAG>%s</TAG>" % sequence_string)
            # insert opening and closing timex tags
            first_element.insert_element_before(timex_tag)
            last_element.insert_element_after(
                XmlDocElement('</TIMEX3>', 'TIMEX3'))
        except ExpatError:
            logger.warn("Could not wrap TIMEX3 tag around\n\t %s"
                        % sequence_string)
    # save the Document object of infile1 as the resulting merged file
    doc1.save_to_file(merged_file)
def retrieve_fragments(self, wrapping_tag=None):
    """Retrieve fragments and insert them into the tags that the
    fragments were extracted from.

    Arguments:
       wrapping_tag - name of the tag (if any) that was used by
          create_fragments to wrap the content of the fragment file,
          it needs to be removed in this method

    Unlike with create_fragments, there is no tag argument. This
    argument is not needed here because a fragment is a pair of a file
    base name tag and a DocElement that contains the tag."""
    # NOTES:
    # - this method contains some dangerous code and it needs to be looked at.
    # - there is also a hack put in for ATEE documents, needs attention as well
    # - it should not just insert texts (which is xml) into tag chardata
    for fragment in self.fragments:
        base = fragment[0]
        tag = fragment[1]
        in_file_name = "%s/%s.%s" % (self.DIR_DATA, base,
                                     self.RETRIEVAL_EXTENSION)
        # close each fragment file after reading instead of leaking one
        # file descriptor per iteration
        with open(in_file_name, 'r') as in_file:
            text = in_file.read()
        # this was done for the ATEE documents, needs to be added as an option
        text = text.replace("\n", ' ')
        try:
            doc = Parser().parse_string(text)
        except ExpatError:
            # not well-formed on its own, wrap it and remember that the
            # wrapper needs to be stripped again below
            doc = Parser().parse_string('<fragment>' + text + '</fragment>')
            wrapping_tag = 'fragment'
        if wrapping_tag:
            # This is dangerous since it can remove any fragment
            # Should check for occurrence of tag at begin and end
            # and remove first and last x characters if needed.
            # Use something like text.startswith("<fragment>\n"),
            text = text.replace("<%s>" % wrapping_tag, ' ')
            text = text.replace("</%s>" % wrapping_tag, ' ')
            doc.elements.pop()
            doc.elements.pop(0)
        tag.replace_content_with_list(doc.elements)
def process_training_fragments(self):
    """Retrieve the TRAINING XmlDocument and hand it to the feature
    extracter for processing. Features file will be used for training.

    Side effects: changes the working directory to the classifier
    directory, writes the per-fragment vector files and updates the
    shared feature index dictionary on disk."""
    os.chdir(self.DIR_CLASSIFIER)
    # NOTE(review): perl and fragment_count are computed but not used
    # in this method -- confirm whether they can be dropped
    perl = self.tarsqi_instance.getopt_perl()
    fragment_count = 0
    for fragment in self.fragments:
        base = fragment[0]
        fragment_count += 1
        # per-fragment file names for the raw and training vectors
        fin = os.path.join(self.DIR_DATA,
                           base + '.' + self.CREATION_EXTENSION)
        ee_vectors = fin + '.EE'
        et_vectors = fin + '.ET'
        # tt_vectors = fin + '.TT'
        ee_train_vectors = fin + '.train.EE'
        et_train_vectors = fin + '.train.ET'
        # tt_train_vectors = fin + '.train.TT'
        fragment_doc = Parser().parse_file(open(fin, "r"))
        # the document creation time is needed for event-time features
        fragment_doc.set_dct_timex(self.document.get_dct())
        # vectors.create_vectors(fragment_doc, ee_vectors, et_vectors, tt_vectors)
        # vectors.create_vectors(fragment_doc, ee_vectors, et_vectors)
        """ Without narrative scheme """
        dictionary_file = os.path.join(self.DICT_DATA, 'feature_index.dict')
        """ With narrative scheme """
        # dictionary_file = os.path.join( self.DICT_DATA,
        #     'feature_index_with_narrative_scheme.dict' )
        # the feature index is loaded, extended during vector creation,
        # and dumped back so later fragments share the same indexes
        feature_index_dict = Feature_Index_Dict()
        feature_index_dict.load_from_file(dictionary_file)
        """ Without narrative scheme """
        tree_vectors.create_vectors(fragment_doc,
                                    self.auxillary[PARSED_DOCUMENT],
                                    feature_index_dict, ee_vectors,
                                    et_vectors)
        """ With narrative scheme """
        # tree_vectors_with_narrative.create_vectors(fragment_doc,
        #     self.auxillary[PARSED_DOCUMENT],
        #     feature_index_dict, ee_vectors, et_vectors)
        feature_index_dict.dump_to_file(dictionary_file)
        print 'done create vectors'
        # feature_recollect( self.document, ee_vectors, et_vectors, tt_vectors,
        #     ee_train_vectors, et_train_vectors, tt_train_vectors)
        # pair the extracted features with gold labels for training
        feature_recollect(self.document, ee_vectors, et_vectors,
                          ee_train_vectors, et_train_vectors)
        print 'done collect training label and features'
        print '======================================================'
def merger(infile1, infile2, outfile):
    """Merge two files together.

    Arguments:
       infile1 - String, file name for the preprocessed file (with
          token tags)
       infile2 - String, file name for the original file (including
          TIMEX3, MAKEINSTANCE, LINK)
       outfile - String, file name for the output file"""
    # context managers prevent leaking the two input handles
    with open(infile1, "r") as f1:
        doc1 = Parser().parse_file(f1)
    with open(infile2, "r") as f2:
        doc2 = Parser().parse_file(f2)
    doc3 = XmlDocument()
    # _mark_lex_tags(doc1)
    _merge_tags(doc1, doc2, doc3)
    doc3.save_to_file(outfile)
def process(self, infile, outfile):
    """Process a fragment file and add TIMEX3 tags that were missed by
    Tempex.

    Arguments:
       infile - an absolute path
       outfile - an absolute path"""
    # close the input handle instead of leaking it
    with open(infile, 'r') as fh:
        xmldoc = Parser().parse_file(fh)
    self.doctree = FragmentConverter(xmldoc, infile).convert()
    self.find_timexes()
    self.doctree.printOut(outfile)
def parse_xml(self):
    """Use the expat parser to read in the document. Takes the value of
    self.input_document and puts the result in self.xml_document.

    A new parser is created for each document: reusing a single parser
    caused segmentation faults in the past."""
    # NOTE: this method can probably be moved to the superclass
    # close the handle after parsing instead of leaking it
    with open(self.input_document, "r") as xmlfile:
        self.xml_document = Parser().parse_file(xmlfile)
def _add_tlinks_to_fragment(self, in_fragment, tmp_fragment, out_fragment):
    """Takes the links created by the classifier and merges them into
    the input fragment.

    Arguments:
       in_fragment - path of the fragment the links are merged into
       tmp_fragment - path of the file with the classifier's TLINKs
       out_fragment - path the result is written to"""
    # use context managers so the file handles are not leaked
    with open(in_fragment, 'r') as f1:
        xmldoc1 = Parser().parse_file(f1)
    with open(tmp_fragment, 'r') as f2:
        xmldoc2 = Parser().parse_file(f2)
    for tlink in xmldoc2.get_tags(TLINK):
        reltype = tlink.attrs[RELTYPE]
        # each end point of a TLINK is either an event instance or a timex
        id1 = tlink.attrs.get(EVENT_INSTANCE_ID) or tlink.attrs.get(TIME_ID)
        if not id1:
            logger.warn("Could not find id1 in " + tlink.content)
        id2 = tlink.attrs.get(RELATED_TO_EVENT_INSTANCE) \
              or tlink.attrs.get(RELATED_TO_TIME)
        if not id2:
            logger.warn("Could not find id2 in " + tlink.content)
        # record the creating component and the classifier's confidence
        origin = CLASSIFIER + ' ' + tlink.attrs.get(CONFIDENCE, '')
        xmldoc1.add_tlink(reltype, id1, id2, origin)
    xmldoc1.save_to_file(out_fragment)
def process(self, infile, outfile):
    """Apply all S2T rules to the input file. Parses the xml file with
    xml_parser.Parser and converts it to a shallow tree with
    converter.FragmentConverter. Then calls createTLinksFromSlinks.

    Arguments:
       infile - an absolute path
       outfile - an absolute path"""
    # close the input handle instead of leaking it
    with open(infile, "r") as xmlfile:
        self.xmldoc = Parser().parse_file(xmlfile)
    self.doctree = FragmentConverter(self.xmldoc, infile).convert()
    self.alinks = self.doctree.alink_list
    self.slinks = self.doctree.slink_list
    self.tlinks = self.doctree.tlink_list
    # the alink-to-tlink derivation is currently disabled
    # self.createTLinksFromALinks()
    self.createTLinksFromSLinks()
    self.xmldoc.save_to_file(outfile)
def check_event_pair_in_doc(file): xmldoc = Parser().parse_file(open(file, "r")) checker = crd() verb_events = [] for element in xmldoc: if (element.tag == 'EVENT' and element.is_opening_tag()): prev_lex = element while prev_lex.tag != 'lex': prev_lex = prev_lex.previous if prev_lex.attrs['pos'][:2] == 'VB': if len(wn.synsets(element.next.content, 'v')) > 0: verb_event = wn.synsets(element.next.content, 'v')[0].lemma_names[0] verb_events.append(verb_event) print verb_events pair_in_database_counter = 0 pair_in_database = [] pair_in_database_with_some_certainty = [] print 'Number of verb events : ' + str(len(verb_events)) no_of_verb_events = len(verb_events) for i in xrange(no_of_verb_events): print i for j in xrange(i + 1, no_of_verb_events): v_1 = verb_events[i] v_2 = verb_events[j] if v_1 == v_2: continue try: result = checker.check_in_database(v_1, v_2) if result != None: pair_in_database_counter += 1 pair_in_database.append((v_1, v_2, result)) if result[0] > 3 * result[1] or result[1] > 3 * result[0]: pair_in_database_with_some_certainty.append( (v_1, v_2, result)) except Exception: print 'EXCEPTION' print 'Number of pairs in database : ' + str(len(pair_in_database)) print 'Percentage :' + str( float(len(pair_in_database)) / (no_of_verb_events * (no_of_verb_events - 1) / 2) * 100) + '%' print 'Number of pairs in database with some certainty of order : ' + str( len(pair_in_database_with_some_certainty)) print 'Percentage :' + str( float(len(pair_in_database_with_some_certainty)) / (no_of_verb_events * (no_of_verb_events - 1) / 2) * 100) + '%'
def __init__(self, file_name):
    """Parse file_name and build the sentence, event, timex and link
    indexes used by the other methods.

    Arguments:
       file_name - path of the xml file to load"""
    self.file_name = file_name
    # close the handle after parsing instead of leaking it
    with open(file_name, 'r') as fh:
        self.xmldoc = Parser().parse_file(fh)
    self.sentences = []
    self.instance_idx = {}
    self.event_idx = {}
    self.timex_idx = {}
    self.events = []
    self.timexes = []
    self.links = {}
    self._init_sentence_list()
    self._init_indexes()
    self._init_events_list()
    self._init_timexes_list()
    self._init_link_index()
def process_fragments(self): """Retrieve the XmlDocument and hand it to the classifier for processing. Processing will update this slice when tlinks are added.""" os.chdir(self.DIR_CLASSIFIER) perl = self.tarsqi_instance.getopt_perl() ee_model = os.path.join('data', 'op.e-e.model') et_model = os.path.join('data', 'op.e-t.model') fragment_count = 0 for fragment in self.fragments: base = fragment[0] fragment_count += 1 fin = os.path.join(self.DIR_DATA, base + '.' + self.CREATION_EXTENSION) ftmp = os.path.join(self.DIR_DATA, base + '.' + self.TMP_EXTENSION) fout = os.path.join(self.DIR_DATA, base + '.' + self.RETRIEVAL_EXTENSION) ee_vectors = fin + '.EE' et_vectors = fin + '.ET' ee_results = ee_vectors + '.REL' et_results = et_vectors + '.REL' fragment_doc = Parser().parse_file(open(fin, "r")) vectors.create_vectors(fragment_doc, ee_vectors, et_vectors) print 'done create vectors' commands = [ "./%s -input %s -model %s -output %s > class.log" % (self.executable, ee_vectors, ee_model, ee_results), "./%s -input %s -model %s -output %s > class.log" % (self.executable, et_vectors, et_model, et_results), "%s collectClassifier.pl %s %s %s" % (perl, ee_vectors, et_vectors, ftmp) ] for command in commands: os.system(command) print 'done create features' self._add_tlinks_to_fragment(fin, ftmp, fout)
def process(self, infile, outfile):
    """Process a fragment file and write a file with EVENT tags.

    Arguments:
       infile - an absolute path
       outfile - an absolute path"""
    # NOTE: removed the dead use_old toggle (it was assigned True and
    # then immediately False, so the old parseFile path never ran)
    # close the input handle instead of leaking it
    with open(infile, 'r') as fh:
        xmldoc = Parser().parse_file(fh)
    # creating the document tree takes way too long, needs to be optimized
    self.doctree = FragmentConverter(xmldoc, infile).convert()
    self.extractEvents()
    self.doctree.printOut(outfile)
def parse(self, input_file, output_file): print ' ================= Parsing ===================' input_xml_doc = Parser().parse_file(open(input_file, "r")) inside_text = False plain_text = '' for element in input_xml_doc: if inside_text: if not element.is_tag() and not element.is_space(): original_text += plain_text if element.is_opening_tag() and element.tag == 'TEXT': inside_text = True if element.is_closing_tag() and element.tag == 'TEXT': inside_text = False result = nlp.parse(plain_text) shelf_file = sh.open(output_file) for key in result: shelf_file[key] = result[key] shelf_file.close()
def process(self, infile, outfile):
    """Run Slinket on the input file and write the results to the
    output file. Both input and output file are fragments. Uses the xml
    parser as well as the fragment converter to prepare the input and
    create the shallow tree that Slinket requires.

    Arguments:
       infile - an absolute path
       outfile - an absolute path"""
    # NOTE: removed the dead use_old toggle (it was assigned True and
    # then immediately False, so the eventParser path never ran)
    # close the input handle instead of leaking it
    with open(infile, 'r') as fh:
        xmldoc = Parser().parse_file(fh)
    self.doctree = FragmentConverter(xmldoc, infile).convert(user=SLINKET)
    for sentence in self.doctree:
        self._find_links(self.doctree, sentence)
    self.doctree.printOut(outfile)
def parse(self, input_file, output_file): print ' ================= Parsing ===================' input_xml_doc = Parser().parse_file(open(input_file, "r")) inside_text = False plain_text = '' for element in input_xml_doc: if inside_text: if not element.is_tag() and not element.is_space(): plain_text += element.content if element.is_opening_tag() and element.tag == 'TEXT': inside_text = True if element.is_closing_tag() and element.tag == 'TEXT': inside_text = False # print plain_text plain_text = 'I love to eat.' result = self.__nlpParser__.parse(plain_text) shelf_file = sh.open(output_file) for key in result: shelf_file[key] = result[key] shelf_file.close()
def xml_tree(filename, tab=' ', stack=None):
    """Takes an xml file, opens it, and creates a string that shows the
    XML tree.

    Arguments:
       filename - path of the xml file to render
       tab - string used for one level of indentation
       stack - internal stack of currently open tags; leave at None
    Returns the tree as a single string."""
    # BUG FIX: the default for stack used to be a mutable [] which
    # Python shares across calls, so tags left open by one call
    # corrupted the indentation of every subsequent call
    if stack is None:
        stack = []
    tree_string = ''
    # close the input handle instead of leaking it (and avoid shadowing
    # the builtin 'file')
    with open(filename, 'r') as fh:
        doc = Parser().parse_file(fh)
    for element in doc.elements:
        if element.is_opening_tag():
            stack.append(element)
        # tags print at the depth of their enclosing scope, character
        # data one level deeper
        if element.is_tag():
            indent = (len(stack) - 1) * tab
        else:
            indent = len(stack) * tab
        content_string = element.content.strip()
        # render link and instance tags as self-closing tags
        # NOTE(review): [:-2] assumes the content ends with two
        # droppable characters -- confirm against actual tag strings
        if content_string.startswith('<TLINK') or \
           content_string.startswith('<SLINK') or \
           content_string.startswith('<MAKEINSTANCE'):
            content_string = content_string[:-2] + ' />'
        if content_string != '' and \
           content_string not in ['</TLINK>', '</SLINK>', '</MAKEINSTANCE>']:
            # renamed the local from 'str' so the builtin is not shadowed
            line = "%s %s\n" % (indent, content_string)
            tree_string += line
        if element.is_closing_tag():
            stack.pop()
    return tree_string
def incorporate_tlink_with_prior_correction(svm_histogram_file,
                                            no_tlink_directory,
                                            result_directory,
                                            tlink_directory,
                                            histogram_class,
                                            correction_method):
    """Iteratively re-estimate the label prior from the classifier's
    posteriors over all result files, then inject the corrected TLINKs
    into the tlink-less documents and save them to tlink_directory.

    Arguments:
       svm_histogram_file - file the histogram is loaded from
       no_tlink_directory - directory with documents stripped of TLINKs
       result_directory - directory with the *.REL classifier results
       tlink_directory - directory the updated documents are written to
       histogram_class - class providing load_histogram()
       correction_method - callable(result_file, prior) returning the
          corrected relation info per feature type"""
    histogram = histogram_class.load_histogram(svm_histogram_file)
    result_files = glob.glob(
        os.path.join(result_directory, '*%s' % RESULT_SUFFIX))
    # NOTE(review): these two counters are never updated in this
    # function -- confirm whether they are leftovers
    total_fix_label_counter = 0
    total_worsen_label_counter = 0
    prior = histogram.get_prior()
    """ Should be converging condition """
    # fixed two EM-style iterations instead of a convergence test
    all_relation_collect = {}
    for i in xrange(2):
        print i
        logging.info('======================RUN========================')
        logging.info(i)
        for feature_type in prior:
            logging.info(feature_type)
            logging.info(prior[feature_type])
        # re-classify every result file under the current prior and
        # collect the per-line label probabilities per feature type
        label_prob_collect = {}
        for result_file in result_files:
            new_relation_collect = correction_method(result_file, prior)
            for feature_type in new_relation_collect:
                if feature_type not in label_prob_collect:
                    label_prob_collect[feature_type] = {}
                label_prob_collect[feature_type][
                    result_file] = new_relation_collect[feature_type]
        all_relation_collect[i] = label_prob_collect
        new_prior = {}
        new_prior_count = defaultdict(int)
        """
        update prior here
        Update prior based on the posterior received from classifying
        each sample on test data.
        """
        for feature_type in label_prob_collect:
            new_prior[feature_type] = defaultdict(float)
            for result_file in label_prob_collect[feature_type]:
                for line_counter in label_prob_collect[feature_type][
                        result_file]:
                    probability, raw_relType, id_0, id_1 =\
                        label_prob_collect[feature_type][result_file][line_counter]
                    new_prior_count[feature_type] += 1
                    max_value = max(probability.values())
                    # tf = sorted(probability.values())
                    # NOTE(review): the line defining tf above is
                    # commented out, so the test below raises NameError
                    # when reached -- confirm the intended guard (the
                    # hard-assignment variant below uses tf[0] == tf[-1])
                    if tf[0] == tf[1]:
                        print probability
                    # soft update: accumulate the full posterior mass
                    for label in probability:
                        new_prior[feature_type][label] += probability[label]
            # normalize the accumulated mass into a distribution
            for label in new_prior[feature_type]:
                new_prior[feature_type][label] /= new_prior_count[feature_type]
        """
        Second way of update prior:
        Update prior on the real label assigned to each sample each
        iteration. It's the extremity version of first prior approach,
        by actually assign for each label a posterior of 1 for the most
        likely label.
        """
        # for feature_type in label_prob_collect:
        #     new_prior[feature_type] = defaultdict(float)
        #     for result_file in label_prob_collect[feature_type]:
        #         for line_counter in label_prob_collect[feature_type][result_file]:
        #             probability, raw_relType, id_0, id_1 =\
        #                 label_prob_collect[feature_type][result_file][line_counter]
        #             new_prior_count[feature_type] += 1
        #             max_value = max(probability.values())
        #             # tf = sorted(probability.values())
        #             if tf[0] == tf[-1]:
        #                 new_prior[feature_type][raw_relType] += 1
        #             else:
        #                 for label in probability:
        #                     if probability[label] == max_value:
        #                         new_prior[feature_type][label] += 1
        #                         break
        #     for label in new_prior[feature_type]:
        #         new_prior[feature_type][label] /= new_prior_count[feature_type]
        prior = new_prior
    """ Check this part first """
    # return
    # regroup the collected probabilities per result file; the empty
    # list below means every feature type currently takes the results
    # of the last iteration (the first branch is disabled)
    result_file_collect = {}
    for feature_type in label_prob_collect:
        if feature_type in []:
            for result_file in all_relation_collect[0][feature_type]:
                if result_file not in result_file_collect:
                    result_file_collect[result_file] = {}
                result_file_collect[result_file][feature_type] =\
                    all_relation_collect[0][feature_type][result_file]
        else:
            for result_file in label_prob_collect[feature_type]:
                if result_file not in result_file_collect:
                    result_file_collect[result_file] = {}
                result_file_collect[result_file][feature_type] =\
                    label_prob_collect[feature_type][result_file]
    for result_file in result_file_collect:
        # map the result file name onto its tlink-less source and the
        # output file carrying the injected TLINKs
        rel_filename = result_file[result_file.rindex(os.path.sep) + 1:]
        no_tlink_file = os.path.join(
            no_tlink_directory,
            '%s%s' % (rel_filename[:-len(RESULT_SUFFIX)], NO_TLINK_SUFFIX))
        tlink_file = os.path.join(
            tlink_directory,
            '%s%s' % (rel_filename[:-len(RESULT_SUFFIX)], ADD_TLINK_SUFFIX))
        xml_document = Parser().parse_file(open(no_tlink_file, "r"))
        for feature_type in result_file_collect[result_file]:
            for line_counter in result_file_collect[result_file][feature_type]:
                (probability, raw_relType, id_0, id_1) =\
                    result_file_collect[result_file][feature_type][line_counter]
                tf = sorted(probability.values())
                if tf[0] == tf[-1]:
                    """ I should fix the label here to the label guessed
                    by vote dict """
                    # all probabilities equal: fall back to the raw label
                    relType = raw_relType
                else:
                    if tf[-1] == tf[-2]:
                        print probability
                    # pick the label with the highest probability
                    relType = sorted(probability.items(),
                                     key=lambda x: x[1])[-1][0]
                if relType != NORELATION:
                    xml_document.add_tlink(relType, id_0, id_1,
                                           SVM_CLASSIFIER_ORIGIN)
        xml_document.save_to_file(tlink_file)
def check_abnormal_single(tlink_file):
    """Check the event-event TLINKs of one document for consistency
    with the TLINKs that tie each main event to the document creation
    time (t0), using the LOGIC_COMPOSITION table.

    Arguments:
       tlink_file - path of the document with classified TLINKs
    Returns a (wrong_number, total_number) pair."""
    xml_document = Parser().parse_file(open(tlink_file, "r"))
    ee_tlinks = []
    """
    It doesn't ensure that any event that has a TLINK with the dct time
    need to be the main event in a sentence, but it's correct for the
    classified tlinks.
    """
    main_events = {}
    for element in xml_document.get_tags(TLINK):
        # keep track of event order here
        if element.is_opening_tag():
            if EVENT_INSTANCE_ID in element.attrs:
                eid = element.attrs[EVENT_INSTANCE_ID]
                if RELATED_TO_EVENT_INSTANCE in element.attrs:
                    # event-event link
                    reid = element.attrs[RELATED_TO_EVENT_INSTANCE]
                    if RELTYPE in element.attrs:
                        ee_tlinks.append(
                            (eid, reid, element.attrs[RELTYPE]))
                if RELATED_TO_TIME in element.attrs:
                    # event-time link; only links to t0 (the document
                    # creation time) mark a main event
                    rtid = element.attrs[RELATED_TO_TIME]
                    if RELTYPE in element.attrs:
                        if rtid == 't0':
                            """
                            Reverse the relation and the position of
                            time and event so as the timeid is always
                            the main entity
                            """
                            main_events[eid] = (reverse(
                                element.attrs[RELTYPE]))
            if TIME_ID in element.attrs:
                tid = element.attrs[TIME_ID]
                if RELATED_TO_EVENT_INSTANCE in element.attrs:
                    reid = element.attrs[RELATED_TO_EVENT_INSTANCE]
                    if RELTYPE in element.attrs:
                        if tid == 't0':
                            # t0 already the source: keep as-is
                            main_events[reid] = (element.attrs[RELTYPE])
    total_number = 0
    wrong_number = 0
    for ee_tlink in ee_tlinks:
        # only check pairs where both events are anchored to t0
        if ee_tlink[0] in main_events and ee_tlink[1] in main_events:
            total_number += 1
            """
            They are all main events
            Relations between main events: AFTER, BEFORE and SIMULTANEOUS
            Relations between main event and dct time: AFTER, BEFORE
            and SIMULTANEOUS
            """
            logging.info("=========================================")
            logging.info(ee_tlink[0])
            logging.info(ee_tlink[1])
            """ event 0 is main, related to dct time """
            relation_1 = reverse(main_events[ee_tlink[0]])
            logging.info(relation_1)
            """ dct time is main, related to event 1 """
            relation_2 = main_events[ee_tlink[1]]
            logging.info(relation_2)
            """ event 0 is main, related to event 1 """
            relation_3 = ee_tlink[2]
            logging.info(relation_3)
            # the direct relation must be compatible with the relation
            # composed through t0
            if relation_3 in LOGIC_COMPOSITION[(relation_1, relation_2)]:
                logging.info("Satisfy constraint")
            else:
                wrong_number += 1
                logging.warn("DOESN'T SATISFY")
    logging.info("Wrong number %d/ Total %d" % (wrong_number, total_number))
    return (wrong_number, total_number)
def tlink_inject_with_prior_with_check(no_tlink_file, result_file,
                                       tlink_file, original_file):
    """Inject TLINKs into a tlink-less document, re-scoring each
    event-event label with histogram and narrative-scheme (lemma pair)
    probabilities, and compare the outcome against the gold relations
    in original_file.

    Arguments:
       no_tlink_file - path of the document stripped of TLINKs
       result_file - path of the json file with the per-line vote data
       tlink_file - path the document with injected TLINKs is written to
       original_file - path of the gold document used for checking
    Returns a (fix_label_counter, worsen_label_counter) pair counting
    how often the prior correction fixed resp. worsened a label.

    Relies on the module-level histogram and crd objects."""
    """
    Verb event should be a dictionary to map between an event id and
    some other lemmas generated from the initial lemma.
    """
    verb_events = {}
    """
    EVENT tag sample
    <EVENT class="OCCURRENCE" eid="e1000028">
    """
    xml_document = Parser().parse_file(open(no_tlink_file, "r"))
    xmldoc_original = Parser().parse_file(open(original_file, "r"))
    for element in xml_document.get_tags(EVENT):
        if element.is_opening_tag():
            eid = element.attrs[EID]
            # the text of the event is the character data right after
            # the opening tag
            event_content = element.next.content
            synsets_event = None
            if len(wn.synsets(event_content, 'v')) > 0:
                synsets_event = wn.synsets(element.next.content,
                                           'v')[0].lemma_names
            verb_morphy = wn.morphy(event_content, 'v')
            verb_events[eid] = {
                MORPHY_LEMMA: verb_morphy,
                SYNSET_LEMMA: synsets_event
            }
    """
    <MAKEINSTANCE eventID="e2" polarity="POS" pos="VERB" eiid="ei2"
    tense="PRESENT" aspect="PERFECTIVE">
    """
    # map event instance ids onto the lemma info of their event
    verb_event_instance = {}
    for element in xml_document.get_tags(INSTANCE):
        if element.is_opening_tag():
            eiid = element.attrs[EIID]
            eid = element.attrs[EVENTID]
            if eid in verb_events:
                verb_event_instance[eiid] = verb_events[eid]
    """
    All TLINKs in the original document between two events
    Because excepts the TLINKs parts, original and classified documents
    should be identical, so they could use the same verb_event_instance.
    """
    original_ee_tlinks = {}
    for element in xmldoc_original.get_tags(TLINK):
        # keep track of event order here
        if element.is_opening_tag():
            lid = element.attrs[LID]
            if EVENT_INSTANCE_ID in element.attrs:
                eiid = element.attrs[EVENT_INSTANCE_ID]
                if RELATED_TO_EVENT_INSTANCE in element.attrs:
                    reiid = element.attrs[RELATED_TO_EVENT_INSTANCE]
                    if RELTYPE in element.attrs:
                        if eiid in verb_event_instance and \
                           reiid in verb_event_instance:
                            original_ee_tlinks[(eiid, reiid)] = (
                                lid, element.attrs[RELTYPE])
    with open(result_file, 'r') as result_file:
        label_vote_dict = json.load(result_file)
    fix_label_counter = 0
    worsen_label_counter = 0
    for feature_type in label_vote_dict:
        for line_counter in label_vote_dict[feature_type]:
            result_dict = label_vote_dict[feature_type][line_counter][
                RESULT_DICT]
            label_vote = label_vote_dict[feature_type][line_counter][VOTE_DICT]
            ids = label_vote_dict[feature_type][line_counter][TLINK_IDS_DICT]
            # the raw label is the top-voted one
            raw_relType = label_vote[-1][0]
            """
            Have to re calculate the relType here
            - Calculate P ( label | lemma_pair, result_vector )
              ~ P(label) x P ( result_vector | label )
                x P ( lemma_pair | label )
            """
            # if raw_relType == NORELATION or raw_relType == SIMULTANEOUS:
            if raw_relType == NORELATION:
                pass
            else:
                def check_event_pair(ids):
                    # True when no end point is a timex (TID)
                    for id in ids:
                        if id[1] == TID:
                            return False
                    return True
                """
                If the relation is between event pairs, we check the
                narrative scheme, else, just use the raw_relType for
                TLink between time and event.
                """
                new_ids = {}
                for id in ids:
                    if id[1] in [TID, EIID]:
                        new_ids[id[0]] = id[2]
                # look up the gold relation for this pair, reversing it
                # when the pair is stored in the opposite order
                original_relation = None
                if (new_ids['0'], new_ids['1']) in original_ee_tlinks:
                    original_relation = original_ee_tlinks[
                        (new_ids['0'], new_ids['1'])][1]
                elif (new_ids['1'], new_ids['0']) in original_ee_tlinks:
                    original_relation = reverse(
                        original_ee_tlinks[(new_ids['1'], new_ids['0'])][1])
                """
                Eleventh try
                Only consider main event pairs inter sentences
                """
                if check_event_pair(ids):
                    probability = {}
                    max_label = None
                    max_prob = None
                    """
                    Third approach: only consider labels inside the votes
                    """
                    result_prob = {}
                    label_prob = {}
                    lemma_pair_prob = {}
                    for label in [str(label[0]) for label in label_vote]:
                        if not label in [BEFORE, AFTER, SIMULTANEOUS]:
                            continue
                        """
                        15th try: only fix BEFORE and AFTER labels
                        """
                        # if not label in [BEFORE, AFTER]:
                        #     continue
                        # for label in [BEFORE, AFTER, SIMULTANEOUS]:
                        probability[label] = 1
                        # P(result_vector | label) from the histogram
                        result_prob[label] = histogram.get_probability_vector(
                            result_dict, label)
                        probability[label] *= result_prob[label]
                        """
                        First approach: only use the morphy lemma
                        """
                        morphy_1 = verb_event_instance[
                            new_ids['0']][MORPHY_LEMMA]
                        morphy_2 = verb_event_instance[
                            new_ids['1']][MORPHY_LEMMA]
                        # lemma_pair_prob[label] = crd.get_lemma_pair_prob((morphy_1,morphy_2,label))
                        # lemma_pair_prob[label] = crd.get_lemma_pair_prob_smoothing((morphy_1,morphy_2),label)
                        """
                        Tenth approach: desperate try, multiply all of
                        them together
                        """
                        # probability[label] *= lemma_pair_prob[label]
                        """ Done first approach """
                        """
                        Second approach: use all pairs of lemmas with
                        lemma in corresponding two synsets
                        """
                        # P(lemma_pair | label) summed over the cross
                        # product of the two events' synset lemmas
                        lemma_pair_prob[label] = 0
                        synset_1 = verb_event_instance[
                            new_ids['0']][SYNSET_LEMMA]
                        synset_2 = verb_event_instance[
                            new_ids['1']][SYNSET_LEMMA]
                        if synset_1 != None and synset_2 != None:
                            for l_1, l_2 in itertools.product(
                                    synset_1, synset_2):
                                lemma_pair_prob[
                                    label] += crd.get_lemma_pair_prob_smoothing(
                                        (l_1, l_2), label)
                                # lemma_pair_prob[label] += crd.get_lemma_pair_prob((l_1,l_2),label)
                        """ Done second approach """
                        """
                        Seventh try: turn off lemma pairs
                        """
                        probability[label] *= lemma_pair_prob[label]
                        # P(label) from the histogram
                        label_prob[label] = histogram.get_probability_label(
                            label)
                        # """
                        # 14th try: normalize BEFORE and AFTER labels
                        # """
                        # if label == BEFORE or label == AFTER:
                        #     label_prob[label] = (histogram.get_probability_label (BEFORE)
                        #         + histogram.get_probability_label (AFTER))/2
                        """
                        13 rd try: disable label prob
                        """
                        probability[label] *= label_prob[label]
                        # track the argmax over the re-scored labels
                        if max_prob == None or max_prob < probability[label]:
                            max_prob = probability[label]
                            max_label = label
                    """
                    Forth try: if max_prob == 0, it means that all
                    probabilities = 0 and we should follow the initialy
                    vote
                    """
                    if max_prob == 0:
                        relType = raw_relType
                    else:
                        relType = max_label
                    # log the cases where the correction changed the
                    # label or failed to fix a known-wrong label
                    need_to_keep_track = False
                    if (relType == raw_relType and original_relation != None
                            and original_relation != relType
                            and original_relation in
                            [BEFORE, AFTER, SIMULTANEOUS]):
                        need_to_keep_track = True
                        logging.info(
                            '---------------DOESNT HELP----------------')
                    if relType != raw_relType and original_relation != None:
                        need_to_keep_track = True
                        if (original_relation == relType):
                            fix_label_counter += 1
                        if (original_relation == raw_relType):
                            worsen_label_counter += 1
                        logging.info(
                            '---------------MAKE CHANGE----------------')
                    if need_to_keep_track:
                        logging.info('Correct relation : %s'
                                     % original_relation)
                        logging.info('Original classified : %s'
                                     % raw_relType)
                        logging.info('Prior classified : %s' % relType)
                        logging.info(morphy_1)
                        logging.info(morphy_2)
                        logging.info(synset_1)
                        logging.info(synset_2)
                        logging.info('--result_prob--')
                        logging.info(result_prob)
                        logging.info('--label_prob--')
                        logging.info(label_prob)
                        logging.info('--lemma_pair_prob--')
                        logging.info(lemma_pair_prob)
                        logging.info(probability)
                        logging.info('==============================')
                else:
                    # time-event pair: keep the raw label
                    relType = raw_relType
                xml_document.add_tlink(relType, new_ids['0'], new_ids['1'],
                                       SVM_CLASSIFIER_ORIGIN)
    xml_document.save_to_file(tlink_file)
    return (fix_label_counter, worsen_label_counter)
def _collect_verb_events(xmldoc):
    """Map each event id (eid) in xmldoc to a lemma dictionary with the
    event's morphy verb lemma (MORPHY_LEMMA) and the lemma names of its
    first verbal WordNet synset (SYNSET_LEMMA); either value may be None.

    EVENT tag sample: <EVENT class="OCCURRENCE" eid="e1000028">"""
    verb_events = {}
    for element in xmldoc.get_tags(EVENT):
        if not element.is_opening_tag():
            continue
        eid = element.attrs[EID]
        event_content = element.next.content
        # look the synsets up once (the original code called wn.synsets twice)
        synsets = wn.synsets(event_content, 'v')
        synsets_event = synsets[0].lemma_names if synsets else None
        verb_events[eid] = {
            MORPHY_LEMMA: wn.morphy(event_content, 'v'),
            SYNSET_LEMMA: synsets_event}
    return verb_events


def _collect_event_instances(xmldoc, verb_events):
    """Map each event instance id (eiid) to the lemma dictionary of the
    event it instantiates; instances of events not in verb_events are
    skipped.

    Tag sample:
    <MAKEINSTANCE eventID="e2" polarity="POS" pos="VERB" eiid="ei2"
                  tense="PRESENT" aspect="PERFECTIVE">"""
    verb_event_instance = {}
    for element in xmldoc.get_tags(INSTANCE):
        if element.is_opening_tag():
            eiid = element.attrs[EIID]
            eid = element.attrs[EVENTID]
            if eid in verb_events:
                verb_event_instance[eiid] = verb_events[eid]
    return verb_event_instance


def _event_event_tlinks(xmldoc, verb_event_instance):
    """Yield (eiid, reiid, lid, reltype) for every TLINK in xmldoc that
    relates two event instances present in verb_event_instance.

    BUG FIX: the original code only assigned eiid/reiid when the
    corresponding attribute was present, so a TLINK anchored to a time
    silently reused the ids of an earlier TLINK (or raised NameError on
    the first one).  Require all attributes on the same tag instead."""
    for element in xmldoc.get_tags(TLINK):
        if not element.is_opening_tag():
            continue
        attrs = element.attrs
        if (EVENT_INSTANCE_ID in attrs
                and RELATED_TO_EVENT_INSTANCE in attrs
                and RELTYPE in attrs):
            eiid = attrs[EVENT_INSTANCE_ID]
            reiid = attrs[RELATED_TO_EVENT_INSTANCE]
            if eiid in verb_event_instance and reiid in verb_event_instance:
                yield (eiid, reiid, attrs[LID], attrs[RELTYPE])


def compare_performance_single(tlink_file, original_file):
    """Compare the performance of a classified file (the result of any
    algorithm or method) against a gold original file and the event pair
    temporal ordering provided by the narrative scheme database.

    Only TLINKs that satisfy all of the following are compared:
    - the TLINK holds between two events;
    - both events appear in the original file and the classified file;
    - both events can be found in the narrative scheme.

    All results are written to the log; nothing is returned."""
    logging.info('================Compare======================')
    logging.info('Classified file: %s', tlink_file)
    logging.info('Original file: %s', original_file)

    # parse both documents, closing the file handles afterwards (the
    # original code leaked both descriptors)
    with open(tlink_file, "r") as infile:
        xmldoc_classified = Parser().parse_file(infile)
    with open(original_file, "r") as infile:
        xmldoc_original = Parser().parse_file(infile)

    # eid -> lemma dictionary, then eiid -> lemma dictionary
    verb_events = _collect_verb_events(xmldoc_classified)
    verb_event_instance = _collect_event_instances(
        xmldoc_classified, verb_events)

    # all event-event TLINKs in the classified document
    ee_tlinks = []
    for eiid, reiid, lid, reltype in _event_event_tlinks(
            xmldoc_classified, verb_event_instance):
        ee_tlinks.append((eiid, reiid, (lid, reltype)))

    # All event-event TLINKs in the original document, indexed by the
    # (eiid, reiid) pair.  Except for the TLINK parts the original and
    # classified documents should be identical, so they can share the
    # same verb_event_instance.
    original_ee_tlinks = {}
    for eiid, reiid, lid, reltype in _event_event_tlinks(
            xmldoc_original, verb_event_instance):
        original_ee_tlinks[(eiid, reiid)] = (lid, reltype)

    no_of_pair_in_database = 0
    no_of_pair = 0
    # pairs of events found in the classified file, the original file and
    # the narrative scheme
    no_of_compare_pairs = 0
    # of those, pairs whose relation types in the classified and original
    # files do not match
    no_of_incompatible_pairs = 0

    for eiid, reiid, tlink in ee_tlinks:
        no_of_pair += 1
        # verb_event_instance maps an instance id to a lemma dictionary
        lemma_dict_1 = verb_event_instance[eiid]
        lemma_dict_2 = verb_event_instance[reiid]
        logging.info(
            "---------------------------------------------------------------")
        logging.info("---------Classified file---------")
        logging.info("Tlink id in classified file is %s", tlink[0])
        relType = tlink[1]
        logging.info("Label in classified file is %s", relType)
        if (eiid, reiid) in original_ee_tlinks:
            no_of_compare_pairs += 1
            original_relation = original_ee_tlinks[(eiid, reiid)][1]
            logging.info("---------Original file---------")
            # BUG FIX: these two messages said "classified file" while
            # printing data taken from the original file
            logging.info("Tlink id in original file is %s",
                         original_ee_tlinks[(eiid, reiid)][0])
            logging.info("Label in original file is %s", original_relation)
            if relType != original_relation:
                no_of_incompatible_pairs += 1
        elif (reiid, eiid) in original_ee_tlinks:
            # the original file stores the pair in the opposite order, so
            # the gold relation has to be reversed before comparing
            no_of_compare_pairs += 1
            original_relation = reverse(original_ee_tlinks[(reiid, eiid)][1])
            logging.info("---------Original file---------")
            logging.info("In the original file, the TLINK is %s",
                         original_relation)
            logging.info("Tlink in original file %s",
                         original_ee_tlinks[(reiid, eiid)][0])
            if relType != original_relation:
                no_of_incompatible_pairs += 1
        else:
            # pair absent from the original file: nothing to compare
            continue

        logging.info("---------LEMMA---------")
        if (lemma_dict_1[MORPHY_LEMMA] is not None
                and lemma_dict_2[MORPHY_LEMMA] is not None):
            v_1 = lemma_dict_1[MORPHY_LEMMA]
            v_2 = lemma_dict_2[MORPHY_LEMMA]
            # identical lemmas carry no ordering information; skip the pair
            if v_1 == v_2:
                continue
            try:
                result = narrative_checker.check_in_dict(v_1, v_2)
                if result is not None:
                    logging.info("%s, %s, %d, %d",
                                 v_1, v_2, result[0], result[1])
            except Exception as e:
                logging.error(str(e))

        logging.info("---------SYNSET---------")
        # accumulated (forward, backward) counts over all synset lemma pairs
        sum_result = [0, 0]
        if (lemma_dict_1[SYNSET_LEMMA] is not None
                and lemma_dict_2[SYNSET_LEMMA] is not None):
            v_1_list = lemma_dict_1[SYNSET_LEMMA]
            v_2_list = lemma_dict_2[SYNSET_LEMMA]
            for v_1, v_2 in itertools.product(v_1_list, v_2_list):
                if v_1 == v_2:
                    continue
                try:
                    result = narrative_checker.check_in_dict(v_1, v_2)
                    if result is not None:
                        sum_result[0] += result[0]
                        sum_result[1] += result[1]
                except Exception as e:
                    logging.error(str(e))
            logging.info("%s\n %s\n %d, %d",
                         str(v_1_list), str(v_2_list),
                         sum_result[0], sum_result[1])
            no_of_pair_in_database += 1

    logging.info('Number of events tlink: %d', no_of_pair)
    logging.info('Number of pairs in database : %d', no_of_pair_in_database)
    # BUG FIX: guard against ZeroDivisionError when no pair was found
    if no_of_pair > 0:
        logging.info('Percentage of pairs that are found '
                     'in narrative scheme database: %.2f',
                     float(no_of_pair_in_database) / no_of_pair)
    logging.info('Number of pairs that are found in database '
                 'and original files as well %d', no_of_compare_pairs)
    logging.info('Number of pairs that are found in database '
                 'and original files and not compatible %d',
                 no_of_incompatible_pairs)
    logging.info(
        '============================================================')