Пример #1
0
    def process_training_fragments(self):
        """Create the training feature files for every fragment.

        Changes the working directory to self.DIR_CLASSIFIER. For each
        fragment in self.fragments the creation file is parsed, its DCT
        timex is set from self.document, the EE and ET vector files are
        written by tree_vectors.create_vectors (the feature index
        dictionary is loaded from and dumped back to disk around that
        call) and feature_recollect is called to produce the '.train.EE'
        and '.train.ET' files.

        No return value."""

        os.chdir(self.DIR_CLASSIFIER)
        # NOTE(review): the value is not used in this method; the call is
        # kept in case getopt_perl() has side effects -- confirm and remove.
        perl = self.tarsqi_instance.getopt_perl()

        for fragment in self.fragments:
            base = fragment[0]

            fin = os.path.join(self.DIR_DATA,
                               base + '.' + self.CREATION_EXTENSION)

            ee_vectors = fin + '.EE'
            et_vectors = fin + '.ET'
            ee_train_vectors = fin + '.train.EE'
            et_train_vectors = fin + '.train.ET'

            # Close the fragment file as soon as it has been parsed instead
            # of leaving the handle open for every fragment.
            with open(fin, "r") as fragment_file:
                fragment_doc = Parser().parse_file(fragment_file)
            fragment_doc.set_dct_timex(self.document.get_dct())

            # Feature index WITHOUT narrative scheme. The variant WITH
            # narrative scheme used the dictionary file
            # 'feature_index_with_narrative_scheme.dict' together with
            # tree_vectors_with_narrative.create_vectors(...) instead.
            dictionary_file = os.path.join(self.DICT_DATA,
                                           'feature_index.dict')
            feature_index_dict = Feature_Index_Dict()
            feature_index_dict.load_from_file(dictionary_file)
            tree_vectors.create_vectors(fragment_doc,
                                        self.auxillary[PARSED_DOCUMENT],
                                        feature_index_dict, ee_vectors,
                                        et_vectors)
            # Persist the index, it is reloaded for the next fragment.
            feature_index_dict.dump_to_file(dictionary_file)
            print('done create vectors')

            feature_recollect(self.document, ee_vectors, et_vectors,
                              ee_train_vectors, et_train_vectors)
            print('done collect training label and features')
            print('======================================================')
Пример #2
0
class Slink2Tlink (TarsqiComponent):

    """Implements the S2T component of Tarsqi.
    S2T takes the output of the Slinket component and applies rules to the
    Slinks to create new Tlinks.

    Instance variables:
       NAME - a string
       rules - an S2TRuleDictionary
       xmldoc - the parsed input document, set by process()
       doctree - the shallow tree built from xmldoc, set by process()
       alinks, slinks, tlinks - the link lists of doctree, set by process()"""

    def __init__(self):
        """Set component name and load rules into an S2TRuleDictionary object.
        This object knows where the rules are stored."""
        self.NAME = S2T
        self.rules = read_rules()

    def process(self, infile, outfile):
        """Apply all S2T rules to the input file.
        Parses the xml file with xml_parser.Parser and converts it to a shallow tree
        with converter.FragmentConverter.  Then calls createTLinksFromSLinks and
        saves the xml document to the output file.
        Arguments:
           infile - path of the file to read
           outfile - path of the file to write"""
        # Close the input as soon as it is parsed rather than leaking the
        # handle; assumes parse_file() consumes the file before returning.
        with open(infile, "r") as xmlfile:
            self.xmldoc = Parser().parse_file(xmlfile)
        self.doctree = FragmentConverter(self.xmldoc, infile).convert()
        #self.print_doctree(S2T)
        self.alinks = self.doctree.alink_list
        self.slinks = self.doctree.slink_list
        self.tlinks = self.doctree.tlink_list
        #self.createTLinksFromALinks()
        self.createTLinksFromSLinks()
        self.xmldoc.save_to_file(outfile)

    def createTLinksFromALinks(self):
        """Calls alink.lookForAtlinks to add Tlinks from Alinks. As noted
        by the original author this currently never does anything
        because at the time of application there are no tlinks in the
        document. Needs to be separated out and applied at a later
        processing stage, after all other tlinking."""
        logger.debug("Number of ALINKs in file: "+str(len(self.alinks)))
        for alinkTag in self.alinks:
            try:
                alink = Alink(self.xmldoc, self.doctree, alinkTag)
                alink.lookForAtlinks()
            except Exception:
                # Best effort: one bad link must not stop the others, but
                # KeyboardInterrupt and SystemExit are no longer swallowed.
                logger.error("Error processing ALINK")

    def createTLinksFromSLinks(self):
        """Creates a Slink object for every slink tag and matches it
        against the S2T rules. Errors are logged and the loop continues
        with the next slink."""
        logger.debug("Number of SLINKs in file: "+str(len(self.slinks)))
        for slinkTag in self.slinks:
            try:
                slink = Slink(self.xmldoc, self.doctree, slinkTag)
                slink.match_rules(self.rules)
            except Exception:
                # Best effort, see createTLinksFromALinks.
                logger.error("Error processing SLINK")
Пример #3
0
class Slink2Tlink(TarsqiComponent):
    """Implements the S2T component of Tarsqi.
    S2T takes the output of the Slinket component and applies rules to the
    Slinks to create new Tlinks.

    Instance variables:
       NAME - a string
       rules - an S2TRuleDictionary
       xmldoc - the parsed input document, set by process()
       doctree - the shallow tree built from xmldoc, set by process()
       alinks, slinks, tlinks - the link lists of doctree, set by process()"""
    def __init__(self):
        """Set component name and load rules into an S2TRuleDictionary object.
        This object knows where the rules are stored."""
        self.NAME = S2T
        self.rules = read_rules()

    def process(self, infile, outfile):
        """Apply all S2T rules to the input file.
        Parses the xml file with xml_parser.Parser and converts it to a shallow tree
        with converter.FragmentConverter.  Then calls createTLinksFromSLinks and
        saves the xml document to the output file.
        Arguments:
           infile - path of the file to read
           outfile - path of the file to write"""
        # The with-block releases the file handle right after parsing;
        # assumes parse_file() reads the whole file before returning.
        with open(infile, "r") as xmlfile:
            self.xmldoc = Parser().parse_file(xmlfile)
        self.doctree = FragmentConverter(self.xmldoc, infile).convert()
        #self.print_doctree(S2T)
        self.alinks = self.doctree.alink_list
        self.slinks = self.doctree.slink_list
        self.tlinks = self.doctree.tlink_list
        #self.createTLinksFromALinks()
        self.createTLinksFromSLinks()
        self.xmldoc.save_to_file(outfile)

    def createTLinksFromALinks(self):
        """Calls alink.lookForAtlinks to add Tlinks from Alinks. As noted
        by the original author this currently never does anything
        because at the time of application there are no tlinks in the
        document. Needs to be separated out and applied at a later
        processing stage, after all other tlinking."""
        logger.debug("Number of ALINKs in file: " + str(len(self.alinks)))
        for alinkTag in self.alinks:
            try:
                alink = Alink(self.xmldoc, self.doctree, alinkTag)
                alink.lookForAtlinks()
            except Exception:
                # Keep going on errors in a single link, but let
                # KeyboardInterrupt and SystemExit propagate.
                logger.error("Error processing ALINK")

    def createTLinksFromSLinks(self):
        """Creates a Slink object for every slink tag and matches it
        against the S2T rules. Errors are logged and the loop continues
        with the next slink."""
        logger.debug("Number of SLINKs in file: " + str(len(self.slinks)))
        for slinkTag in self.slinks:
            try:
                slink = Slink(self.xmldoc, self.doctree, slinkTag)
                slink.match_rules(self.rules)
            except Exception:
                # Keep going on errors in a single link, but let
                # KeyboardInterrupt and SystemExit propagate.
                logger.error("Error processing SLINK")
Пример #4
0
 def process(self, infile, outfile):
     """Apply all S2T rules to the input file.
     Parses the xml file with xml_parser.Parser and converts it to a shallow tree
     with converter.FragmentConverter.  Then calls createTLinksFromSLinks and
     saves the xml document to the output file.
     Arguments:
        infile - path of the file to read
        outfile - path of the file to write"""
     # Release the file handle right after parsing instead of leaking it;
     # assumes parse_file() reads the whole file before returning.
     with open(infile, "r") as xmlfile:
         self.xmldoc = Parser().parse_file(xmlfile)
     self.doctree = FragmentConverter(self.xmldoc, infile).convert()
     #self.print_doctree(S2T)
     self.alinks = self.doctree.alink_list
     self.slinks = self.doctree.slink_list
     self.tlinks = self.doctree.tlink_list
     #self.createTLinksFromALinks()
     self.createTLinksFromSLinks()
     self.xmldoc.save_to_file(outfile)
Пример #5
0
 def __init__(self, file_name):
     """Parse the xml file and build the indexes on top of it.
     Stores the file name and the parsed document, creates empty
     containers for sentences, events, timexes, links and the id
     indexes, and then runs the _init_* methods in the order below.
     Arguments:
        file_name - path of the xml file to parse"""
     self.file_name = file_name
     # Close the handle right after parsing; the local is no longer named
     # 'file' so that the builtin is not shadowed.
     with open(file_name, 'r') as xml_file:
         self.xmldoc = Parser().parse_file(xml_file)
     self.sentences = []
     self.instance_idx = {}
     self.event_idx = {}
     self.timex_idx = {}
     self.events = []
     self.timexes = []
     self.links = {}
     self._init_sentence_list()
     self._init_indexes()
     self._init_events_list()
     self._init_timexes_list()
     self._init_link_index()
Пример #6
0
    def _create_vectors(self, in_fragment, ee_fragment, et_fragment, fragment):
        """New method that takes over the functionality of the old
        Perl script named prepareClassifier.

        UNDER CONSTRUCTION

        Arguments:
           in_fragment - path of the fragment file that is parsed
           ee_fragment - path that is opened for writing as ee_file
           et_fragment - path that is opened for writing as et_file
           fragment - not used in the code visible here"""

        #print in_fragment
        # NOTE(review): both output files are opened (and thereby truncated)
        # but neither written to nor closed in the code visible here --
        # presumably the method is unfinished; confirm.
        ee_file = open(ee_fragment, 'w')
        et_file = open(et_fragment, 'w')
        #print fragment
        # NOTE(review): the input file handle is never closed explicitly.
        frag = Parser().parse_file(open(in_fragment, 'r'))

        # collect objects from the fragment
        events = frag.tags['EVENT']
        instances = frag.tags['MAKEINSTANCE']
        times = frag.tags['TIMEX3']

        # add instance information to events
        # index the instances on their eventID attribute (None if missing)
        eid2inst = {}
        for inst in instances:
            eid = inst.attrs.get('eventID', None)
            eid2inst[eid] = inst
        for event in events:
            eid = event.attrs.get('eid', None)
            # raises KeyError if no instance was indexed under this eid
            inst = eid2inst[eid]
            # copy all instance attributes onto the event, instance values
            # overwrite event attributes with the same key
            for (key, val) in inst.attrs.items():
                event.attrs[key] = val
            #event.attrs['instance'] = eid2inst[eid]
            #print event.attrs

        # sort times and events together on their id attribute; passing a
        # comparison function and using cmp() is Python 2 only
        objects = times + events
        objects.sort(lambda a, b: cmp(a.id, b.id))
Пример #7
0
    def retrieve_fragments(self, wrapping_tag=None):
        """Retrieve fragments and insert them into the tags that the fragments
        were extracted from.

        Arguments:
           wrapping_tag - name of the tag (if any) that was used by
                          create_fragments to wrap the content of the
                          fragment file, it needs to be removed in this method

        Unlike with create_fragments, there is no tag argument. This
        argument is not needed here because a fragment is a pair of a
        file base name tag and a DocElement that contains the tag.

        A fragment that cannot be parsed as is, is parsed again wrapped in
        a <fragment> tag. The first and last element of the parsed
        document are dropped when wrapping_tag was given or when that
        fallback was used for this fragment."""

        # NOTES:
        # - there is a hack put in for ATEE documents, needs attention
        # - the wrapper is removed by position (first and last element),
        #   there is no check that those elements really are the wrapper

        for fragment in self.fragments:
            base = fragment[0]
            tag = fragment[1]
            in_file_name = "%s/%s.%s" % (self.DIR_DATA, base,
                                         self.RETRIEVAL_EXTENSION)
            # read the whole file and close it immediately
            with open(in_file_name, 'r') as in_file:
                text = in_file.read()
            # this was done for the ATEE documents, needs to be added as an option
            text = text.replace("\n", ' ')

            # Decide per fragment whether a wrapper needs to be stripped.
            # Previously the fallback below overwrote wrapping_tag itself,
            # so one unparsable fragment caused the first and last element
            # of every later fragment to be dropped as well.
            strip_wrapper = bool(wrapping_tag)
            try:
                doc = Parser().parse_string(text)
            except ExpatError:
                doc = Parser().parse_string('<fragment>' + text +
                                            '</fragment>')
                strip_wrapper = True

            if strip_wrapper:
                # drop the closing and opening wrapper elements; the old
                # string replacement on text was removed since text is not
                # used after parsing
                doc.elements.pop()
                doc.elements.pop(0)

            #tag.replace_content(text)
            tag.replace_content_with_list(doc.elements)
Пример #8
0
def merge_tags(infile1, infile2, merged_file):

    """Merge the tags from infile1, which has all tags from the input,
    with tags from infile2, which has only s, lex and TIMEX3 tags. The
    lex tags are used as the pivots and it is assumed that both files
    contain the same amount of lex tags.

    Arguments:
       infile1 - path of the file with all tags
       infile2 - path of the file with s, lex and TIMEX3 tags
       merged_file - path that the merged document is saved to

    A TIMEX3 tag is only added if wrapping it around the covered
    sequence gives well-formed XML, otherwise a warning is logged."""

    # create the document objects and add lex_id values to the lex tags;
    # the files are closed as soon as they are parsed
    with open(infile1, "r") as fh1:
        doc1 = Parser().parse_file(fh1)
    with open(infile2, "r") as fh2:
        doc2 = Parser().parse_file(fh2)
    _mark_lex_tags(doc1)
    _mark_lex_tags(doc2)

    # get the timexes and embedded lex tags from infile2, and create
    # index of the lex tags of infile1 using lex_id
    extended_timexes = _get_timextags_with_contained_lextags(doc2)
    lexid_to_lextag = _create_lexid_index(doc1)

    for extended_timex in extended_timexes:

        # get first and last document element of infile1
        timex_tag = extended_timex[0]
        first_lex = extended_timex[1][0]
        last_lex = extended_timex[1][-1]
        first_element = lexid_to_lextag[first_lex]
        last_element = lexid_to_lextag[last_lex].get_closing_tag()

        # get the entire sequence that is to be embedded in the timex tag;
        # "%s" keeps the string conversion of the old incremental formatting
        sequence = first_element.get_slice_till(last_element.id)
        sequence_string = ''.join(["%s" % el.content for el in sequence])

        # check whether this sequence, when embedded in a tag, results
        # in well-formed XML, if so, add the new timex tag to infile1,
        # otherwise, ignore and print warning
        try:
            Parser().parse_string("<TAG>%s</TAG>" % sequence_string)
        except ExpatError:
            logger.warn("Could not wrap TIMEX3 tag around\n\t %s" % sequence_string)
        else:
            # insert opening and closing timex tags
            first_element.insert_element_before(timex_tag)
            last_element.insert_element_after(XmlDocElement('</TIMEX3>', 'TIMEX3'))

    # save the Document object of infile1 as the resulting merged file
    doc1.save_to_file(merged_file)
def merger(infile1, infile2, outfile):
    """
    Merge two files together
    - infile1: String
    File name for the preprocessed file (with token tags)
    - infile2: String
    File name for the original file (including TIMEX3, MAKEINSTANCE, LINK)
    - outfile: String
    File name for the output file

    Both inputs are parsed, _merge_tags fills a new XmlDocument from
    them and that document is saved to outfile.
    """

    # parse both inputs, closing each file right after it has been read
    with open(infile1, "r") as fh1:
        doc1 = Parser().parse_file(fh1)
    with open(infile2, "r") as fh2:
        doc2 = Parser().parse_file(fh2)
    doc3 = XmlDocument()

    #     _mark_lex_tags(doc1)
    _merge_tags(doc1, doc2, doc3)

    doc3.save_to_file(outfile)
Пример #10
0
 def process(self, infile, outfile):
     """Process a fragment file and add TIMEX3 tags that were
     missed by Tempex.
     Parses the input, converts it to a document tree, runs
     find_timexes and prints the tree to the output file.
     Arguments:
        infile - an absolute path
        outfile - an absolute path"""
     # Close the input right after parsing instead of leaking the handle;
     # assumes parse_file() reads the whole file before returning.
     with open(infile, 'r') as xmlfile:
         xmldoc = Parser().parse_file(xmlfile)
     self.doctree = FragmentConverter(xmldoc, infile).convert()
     #self.print_doctree(BTIME)
     self.find_timexes()
     self.doctree.printOut(outfile)
Пример #11
0
 def parse_xml(self):
     """Use the expat parser to read in the document. Takes the
     value of self.input_document and puts the result in
     self.xml_document."""
     # NOTES:
     # - Uses a new parser each time when reading a new document.
     #   Previously, this was done only once in an initialization
     #   method, but at some point segmentation faults showed up that
     #   could only be fixed by creating a new parser for each file.
     # - This method can probably be moved to the superclass.
     # - The input file is closed right after parsing; this assumes that
     #   parse_file() reads the whole file before returning.
     with open(self.input_document, "r") as xmlfile:
         self.xml_document = Parser().parse_file(xmlfile)
Пример #12
0
 def process(self, infile, outfile):
     """Apply all S2T rules to the input file.
     Parses the xml file with xml_parser.Parser and converts it to a shallow tree
     with converter.FragmentConverter.  Then calls createTLinksFromSLinks and
     saves the xml document to the output file.
     Arguments:
        infile - path of the file to read
        outfile - path of the file to write"""
     # The handle is closed once parsing is done; assumes parse_file()
     # reads the whole file before returning.
     with open(infile, "r") as xmlfile:
         self.xmldoc = Parser().parse_file(xmlfile)
     self.doctree = FragmentConverter(self.xmldoc, infile).convert()
     #self.print_doctree(S2T)
     self.alinks = self.doctree.alink_list
     self.slinks = self.doctree.slink_list
     self.tlinks = self.doctree.tlink_list
     #self.createTLinksFromALinks()
     self.createTLinksFromSLinks()
     self.xmldoc.save_to_file(outfile)
Пример #13
0
def check_event_pair_in_doc(file):
    """Print statistics on the verb event pairs of a document.

    Collects, for every opening EVENT tag whose closest preceding lex
    tag has a pos starting with 'VB', the first lemma name of the first
    verb synset of the element following the EVENT tag. Every unordered
    pair of different verbs is then looked up with check_in_database
    and the number and percentage of pairs found is printed, as well as
    those where one of the first two result values is more than three
    times the other.

    Arguments:
       file - path of the xml file to check

    No return value."""
    with open(file, "r") as xml_file:
        xmldoc = Parser().parse_file(xml_file)
    checker = crd()

    verb_events = []
    for element in xmldoc:
        if (element.tag == 'EVENT' and element.is_opening_tag()):
            # walk back to the closest lex tag
            prev_lex = element
            while prev_lex.tag != 'lex':
                prev_lex = prev_lex.previous
            if prev_lex.attrs['pos'][:2] == 'VB':
                # look the synsets up once instead of twice
                synsets = wn.synsets(element.next.content, 'v')
                if len(synsets) > 0:
                    verb_events.append(synsets[0].lemma_names[0])
    print(verb_events)
    pair_in_database = []
    pair_in_database_with_some_certainty = []

    print('Number of verb events : ' + str(len(verb_events)))

    no_of_verb_events = len(verb_events)
    for i in xrange(no_of_verb_events):
        print(i)
        for j in xrange(i + 1, no_of_verb_events):
            v_1 = verb_events[i]
            v_2 = verb_events[j]
            if v_1 == v_2:
                continue
            try:
                result = checker.check_in_database(v_1, v_2)
                if result is not None:
                    pair_in_database.append((v_1, v_2, result))
                    if result[0] > 3 * result[1] or result[1] > 3 * result[0]:
                        pair_in_database_with_some_certainty.append(
                            (v_1, v_2, result))
            except Exception:
                # best effort: a failing lookup should not stop the run
                print('EXCEPTION')

    # n * (n - 1) is always even, so floor division is exact
    total_pairs = no_of_verb_events * (no_of_verb_events - 1) // 2

    def percentage(count):
        # With fewer than two verb events there are no pairs at all, which
        # used to raise a ZeroDivisionError.
        if total_pairs == 0:
            return 0.0
        return float(count) / total_pairs * 100

    print('Number of pairs in database : ' + str(len(pair_in_database)))
    print('Percentage :' + str(percentage(len(pair_in_database))) + '%')
    print('Number of pairs in database with some certainty of order : ' + str(
        len(pair_in_database_with_some_certainty)))
    print('Percentage :' + str(
        percentage(len(pair_in_database_with_some_certainty))) + '%')
Пример #14
0
 def __init__(self, file_name):
     """Parse the xml file and build the indexes on top of it.
     Stores the file name and the parsed document, creates empty
     containers for sentences, events, timexes, links and the id
     indexes, and then runs the _init_* methods in the order below.
     Arguments:
        file_name - path of the xml file to parse"""
     self.file_name = file_name
     # The handle is closed right after parsing and no longer shadows
     # the builtin name 'file'.
     with open(file_name, 'r') as xml_file:
         self.xmldoc = Parser().parse_file(xml_file)
     self.sentences = []
     self.instance_idx = {}
     self.event_idx = {}
     self.timex_idx = {}
     self.events = []
     self.timexes = []
     self.links = {}
     self._init_sentence_list()
     self._init_indexes()
     self._init_events_list()
     self._init_timexes_list()
     self._init_link_index()
Пример #15
0
 def process(self, infile, outfile, dct):
     """Apply all Blinker rules to the input file. Parses the xml
     file with xml_parser.Parser and converts it to a shallow tree
     with converter.FragmentConverter. Then applies the Blinker
     rules and saves the xml document to the output file. Currently
     only applies rules of type 2.
     Arguments
        infile - an absolute path
        outfile - an absolute path
        dct - stored in self.dct before the rules are run
     No return value."""
     self.dct = dct
     # Close the input right after parsing instead of leaking the handle;
     # assumes parse_file() reads the whole file before returning.
     with open(infile, "r") as xmlfile:
         self.xmldoc = Parser().parse_file(xmlfile)
     self.doctree = FragmentConverter(self.xmldoc, infile).convert(user=BLINKER)
     #self.print_doctree(BLINKER)
     self._run_blinker()
     self.xmldoc.save_to_file(outfile)
Пример #16
0
    def process_fragments(self):
        """Retrieve the XmlDocument and hand it to the classifier for processing. Processing will
        update this slice when tlinks are added.

        Changes the working directory to self.DIR_CLASSIFIER. For each
        fragment the EE and ET vector files are created, the classifier
        executable is run on both, collectClassifier.pl is run on the
        vector files and the result is merged into the fragment with
        _add_tlinks_to_fragment."""

        os.chdir(self.DIR_CLASSIFIER)
        perl = self.tarsqi_instance.getopt_perl()

        ee_model = os.path.join('data', 'op.e-e.model')
        et_model = os.path.join('data', 'op.e-t.model')

        for fragment in self.fragments:
            base = fragment[0]

            fin = os.path.join(self.DIR_DATA,
                               base + '.' + self.CREATION_EXTENSION)
            ftmp = os.path.join(self.DIR_DATA, base + '.' + self.TMP_EXTENSION)
            fout = os.path.join(self.DIR_DATA,
                                base + '.' + self.RETRIEVAL_EXTENSION)

            ee_vectors = fin + '.EE'
            et_vectors = fin + '.ET'
            ee_results = ee_vectors + '.REL'
            et_results = et_vectors + '.REL'

            # close the fragment file as soon as it has been parsed
            with open(fin, "r") as fragment_file:
                fragment_doc = Parser().parse_file(fragment_file)
            vectors.create_vectors(fragment_doc, ee_vectors, et_vectors)

            print('done create vectors')

            # NOTE(review): the commands are run through the shell with
            # unquoted paths, and both classifier runs redirect to the same
            # class.log so the second run overwrites the first log.
            commands = [
                "./%s -input %s -model %s -output %s > class.log" %
                (self.executable, ee_vectors, ee_model, ee_results),
                "./%s -input %s -model %s -output %s > class.log" %
                (self.executable, et_vectors, et_model, et_results),
                "%s collectClassifier.pl %s %s %s" %
                (perl, ee_vectors, et_vectors, ftmp)
            ]
            for command in commands:
                os.system(command)

            print('done create features')
            self._add_tlinks_to_fragment(fin, ftmp, fout)
Пример #17
0
    def _add_tlinks_to_fragment(self, in_fragment, tmp_fragment, out_fragment):
        """Takes the links created by the classifier and merges them into the
        input fragment.

        Arguments:
           in_fragment - path of the fragment that the tlinks are added to
           tmp_fragment - path of the file with the classifier's TLINK tags
           out_fragment - path that the merged fragment is saved to

        A warning is logged when no id can be found for either side of a
        link, the link is still added in that case."""

        # parse both files, closing each one right after it has been read
        with open(in_fragment, 'r') as in_file:
            xmldoc1 = Parser().parse_file(in_file)
        with open(tmp_fragment, 'r') as tmp_file:
            xmldoc2 = Parser().parse_file(tmp_file)

        for tlink in xmldoc2.get_tags(TLINK):
            reltype = tlink.attrs[RELTYPE]
            # first id: event instance id, falling back to the time id
            id1 = tlink.attrs.get(EVENT_INSTANCE_ID, None)
            if not id1:
                id1 = tlink.attrs.get(TIME_ID, None)
            if not id1:
                logger.warn("Could not find id1 in " + tlink.content)
            # second id: related event instance, falling back to related time
            id2 = tlink.attrs.get(RELATED_TO_EVENT_INSTANCE, None)
            if not id2:
                id2 = tlink.attrs.get(RELATED_TO_TIME, None)
            if not id2:
                logger.warn("Could not find id2 in " + tlink.content)
            origin = CLASSIFIER + ' ' + tlink.attrs.get(CONFIDENCE, '')
            xmldoc1.add_tlink(reltype, id1, id2, origin)

        xmldoc1.save_to_file(out_fragment)
Пример #18
0
 def process(self, infile, outfile):
     """Process a fragment file and write a file with EVENT tags.
     Builds the document tree for the input, runs extractEvents and
     prints the tree to the output file.
     Arguments:
        infile - an absolute path
        outfile - an absolute path"""
     # Development toggle: set to True to build the tree with the old
     # event parser (parseFile) instead of the xml parser.
     use_old = False
     if use_old:
         #logger.out('start event parser ', time.time())
         self.doctree = parseFile(infile)
         #logger.out('end event parser   ', time.time())
     else:
         # close the input right after parsing instead of leaking the
         # handle; assumes parse_file() reads the whole file
         with open(infile, 'r') as xmlfile:
             xmldoc = Parser().parse_file(xmlfile)
         # creating the document tree takes way too long, needs
         # to be optimized
         self.doctree = FragmentConverter(xmldoc, infile).convert()
     #xmldoc.pretty_print()
     #self.print_doctree(EVITA)
     self.extractEvents()
     self.doctree.printOut(outfile)
Пример #19
0
 def process(self, infile, outfile):
     """Run Slinket on the input file and write the results to the output
     file. Both input and output file are fragments. Uses the xml
     parser as well as the fragment converter to prepare the input
     and create the shallow tree that Slinket requires. _find_links
     is called on every sentence of that tree.
     Arguments:
        infile - an absolute path
        outfile - an absolute path"""
     # Development toggle: set to True to build the tree with the old
     # eventParser.readFileWithEvents instead of the xml parser.
     use_old = False
     if use_old:
         self.doctree = eventParser.readFileWithEvents(infile)
     else:
         # close the input right after parsing instead of leaking the
         # handle; assumes parse_file() reads the whole file
         with open(infile, 'r') as xmlfile:
             xmldoc = Parser().parse_file(xmlfile)
         self.doctree = FragmentConverter(xmldoc,
                                          infile).convert(user=SLINKET)
     #self.print_doctree(SLINKET)
     #logger.debug("Number of sentences in file: " + str(len(self.doctree)))
     for sentence in self.doctree:
         self._find_links(self.doctree, sentence)
     self.doctree.printOut(outfile)
Пример #20
0
    def parse(self, input_file, output_file):
        """Parse the text inside the TEXT tag and store the result.

        Concatenates the content of all elements between the opening and
        closing TEXT tag that are neither tags nor whitespace, hands that
        text to nlp.parse and copies every key of the result into the
        shelf opened on output_file.

        Arguments:
           input_file - path of the xml file to read
           output_file - path of the shelf file to write"""
        print(' ================= Parsing ===================')
        with open(input_file, "r") as xml_file:
            input_xml_doc = Parser().parse_file(xml_file)

        inside_text = False
        text_parts = []
        for element in input_xml_doc:
            if inside_text:
                if not element.is_tag() and not element.is_space():
                    # The old code appended plain_text to the undefined
                    # name original_text, which raised an error on the
                    # first text element and never collected any text.
                    text_parts.append(element.content)
            if element.is_opening_tag() and element.tag == 'TEXT':
                inside_text = True
            if element.is_closing_tag() and element.tag == 'TEXT':
                inside_text = False
        plain_text = ''.join(text_parts)

        result = nlp.parse(plain_text)
        shelf_file = sh.open(output_file)
        try:
            for key in result:
                shelf_file[key] = result[key]
        finally:
            # make sure the shelf is closed even if storing a value fails
            shelf_file.close()
Пример #21
0
def merge_tags(infile1, infile2, merged_file):
    """Merge the tags from infile1, which has all tags from the input,
    with tags from infile2, which has only s, lex and TIMEX3 tags. The
    lex tags are used as the pivots and it is assumed that both files
    contain the same amount of lex tags.

    Arguments:
       infile1 - path of the file with all tags
       infile2 - path of the file with s, lex and TIMEX3 tags
       merged_file - path that the merged document is saved to

    A TIMEX3 tag is only added if wrapping it around the covered
    sequence gives well-formed XML, otherwise a warning is logged."""

    # create the document objects and add lex_id values to the lex tags;
    # each file is closed right after it has been parsed
    with open(infile1, "r") as fh1:
        doc1 = Parser().parse_file(fh1)
    with open(infile2, "r") as fh2:
        doc2 = Parser().parse_file(fh2)
    _mark_lex_tags(doc1)
    _mark_lex_tags(doc2)

    # get the timexes and embedded lex tags from infile2, and create
    # index of the lex tags of infile1 using lex_id
    extended_timexes = _get_timextags_with_contained_lextags(doc2)
    lexid_to_lextag = _create_lexid_index(doc1)

    for extended_timex in extended_timexes:

        # get first and last document element of infile1
        timex_tag = extended_timex[0]
        first_lex = extended_timex[1][0]
        last_lex = extended_timex[1][-1]
        first_element = lexid_to_lextag[first_lex]
        last_element = lexid_to_lextag[last_lex].get_closing_tag()

        # get the entire sequence that is to be embedded in the timex tag;
        # "%s" keeps the string conversion of the old incremental formatting
        sequence = first_element.get_slice_till(last_element.id)
        sequence_string = ''.join(["%s" % el.content for el in sequence])

        # check whether this sequence, when embedded in a tag, results
        # in well-formed XML, if so, add the new timex tag to infile1,
        # otherwise, ignore and print warning
        try:
            Parser().parse_string("<TAG>%s</TAG>" % sequence_string)
        except ExpatError:
            logger.warn("Could not wrap TIMEX3 tag around\n\t %s" %
                        sequence_string)
        else:
            # insert opening and closing timex tags
            first_element.insert_element_before(timex_tag)
            last_element.insert_element_after(
                XmlDocElement('</TIMEX3>', 'TIMEX3'))

    # save the Document object of infile1 as the resulting merged file
    doc1.save_to_file(merged_file)
Пример #22
0
    def parse(self, input_file, output_file):
        """Parse the text inside the TEXT tag and store the result.

        Concatenates the content of all elements between the opening and
        closing TEXT tag that are neither tags nor whitespace, hands that
        text to self.__nlpParser__.parse and copies every key of the
        result into the shelf opened on output_file.

        Arguments:
           input_file - path of the xml file to read
           output_file - path of the shelf file to write"""
        print(' ================= Parsing ===================')
        with open(input_file, "r") as xml_file:
            input_xml_doc = Parser().parse_file(xml_file)

        inside_text = False
        text_parts = []
        for element in input_xml_doc:
            if inside_text:
                if not element.is_tag() and not element.is_space():
                    text_parts.append(element.content)
            if element.is_opening_tag() and element.tag == 'TEXT':
                inside_text = True
            if element.is_closing_tag() and element.tag == 'TEXT':
                inside_text = False
        plain_text = ''.join(text_parts)

        # NOTE(review): a debugging leftover used to overwrite plain_text
        # with the fixed sentence 'I love to eat.' at this point, so the
        # document text was never parsed. It was removed -- confirm that
        # this was not intended.
        result = self.__nlpParser__.parse(plain_text)
        shelf_file = sh.open(output_file)
        try:
            for key in result:
                shelf_file[key] = result[key]
        finally:
            # make sure the shelf is closed even if storing a value fails
            shelf_file.close()
Пример #23
0
def xml_tree(filename, tab='  ', stack=None):
    """Takes an xml file, opens it, and creates a string that shows the
    XML tree.

    Arguments:
       filename - path of the xml file
       tab - string used for one level of indentation
       stack - list that the currently open tags are pushed on, a new
               empty list is used when it is not given

    Returns a string with one line per non-empty element. Opening TLINK,
    SLINK and MAKEINSTANCE tags are shown as empty-element tags and
    their closing tags are left out."""
    # A fresh list per call: with the old mutable default, tags left on
    # the stack by one call were still there in the next call.
    if stack is None:
        stack = []
    with open(filename, 'r') as xml_file:
        doc = Parser().parse_file(xml_file)
    lines = []
    for element in doc.elements:
        if element.is_opening_tag():
            stack.append(element)
        # tags are printed at the level of their parent, other content one
        # level deeper
        if element.is_tag():
            indent = (len(stack) - 1) * tab
        else:
            indent = len(stack) * tab
        content_string = element.content.strip()
        if content_string.startswith(('<TLINK', '<SLINK', '<MAKEINSTANCE')):
            # replace the last two characters to make it an empty-element tag
            content_string = content_string[:-2] + ' />'
        if content_string != '' and \
                content_string not in ['</TLINK>', '</SLINK>', '</MAKEINSTANCE>']:
            lines.append("%s %s\n" % (indent, content_string))
        if element.is_closing_tag():
            stack.pop()
    return ''.join(lines)
Пример #24
0
    def _add_tlinks_to_fragment(self, in_fragment, tmp_fragment, out_fragment):

        """Takes the links created by the classifier and merges them into the
        input fragment.

        Arguments:
           in_fragment - path of the fragment that the tlinks are added to
           tmp_fragment - path of the file with the classifier's TLINK tags
           out_fragment - path that the merged fragment is saved to

        A warning is logged when no id can be found for either side of a
        link, the link is still added in that case."""

        # parse both files, closing each one right after it has been read
        with open(in_fragment, 'r') as in_file:
            xmldoc1 = Parser().parse_file(in_file)
        with open(tmp_fragment, 'r') as tmp_file:
            xmldoc2 = Parser().parse_file(tmp_file)

        for tlink in xmldoc2.get_tags(TLINK):
            attrs = tlink.attrs
            reltype = attrs[RELTYPE]
            # event instance id if present, otherwise the time id
            id1 = attrs.get(EVENT_INSTANCE_ID, None) or attrs.get(TIME_ID, None)
            if not id1:
                logger.warn("Could not find id1 in " + tlink.content)
            # related event instance if present, otherwise the related time
            id2 = (attrs.get(RELATED_TO_EVENT_INSTANCE, None) or
                   attrs.get(RELATED_TO_TIME, None))
            if not id2:
                logger.warn("Could not find id2 in " + tlink.content)
            origin = CLASSIFIER + ' ' + attrs.get(CONFIDENCE,'')
            xmldoc1.add_tlink(reltype, id1, id2, origin)

        xmldoc1.save_to_file(out_fragment)
Пример #25
0
def incorporate_tlink_with_prior_correction(svm_histogram_file,
                                            no_tlink_directory,
                                            result_directory, tlink_directory,
                                            histogram_class,
                                            correction_method):
    """Re-score the classifier results with an iteratively corrected prior
    and write the selected TLINKs into copies of the TLINK-free documents.

    Arguments:
       svm_histogram_file - passed to histogram_class.load_histogram()
       no_tlink_directory - directory holding the documents without TLINKs,
          named <stem> + NO_TLINK_SUFFIX
       result_directory - directory searched for files ending in RESULT_SUFFIX
       tlink_directory - directory the documents with TLINKs are saved to,
          named <stem> + ADD_TLINK_SUFFIX
       histogram_class - object whose load_histogram() returns a histogram
          with a get_prior() method
       correction_method - callable taking (result_file, prior) and returning
          a mapping from feature type to a mapping from line counter to a
          (probability, raw_relType, id_0, id_1) tuple, where probability
          maps labels to numbers

    The prior starts out as the histogram's prior. In each of two passes all
    result files are re-scored with the current prior, after which the prior
    is replaced by the mean of the posteriors that were returned. The
    results of the last pass decide which links are written: the most
    probable label of each sample is added as a TLINK unless it is
    NORELATION."""

    histogram = histogram_class.load_histogram(svm_histogram_file)
    result_files = glob.glob(
        os.path.join(result_directory, '*%s' % RESULT_SUFFIX))
    prior = histogram.get_prior()

    # NOTE: the fixed number of passes stands in for a convergence condition.
    all_relation_collect = {}
    label_prob_collect = {}
    for i in range(2):
        print(i)
        logging.info('======================RUN========================')
        logging.info(i)
        for feature_type in prior:
            logging.info(feature_type)
            logging.info(prior[feature_type])

        # Regroup the results of this pass as feature type -> result file.
        label_prob_collect = {}
        for result_file in result_files:
            new_relation_collect = correction_method(result_file, prior)
            for feature_type in new_relation_collect:
                label_prob_collect.setdefault(feature_type, {})[
                    result_file] = new_relation_collect[feature_type]

        all_relation_collect[i] = label_prob_collect
        prior = _mean_posterior_prior(label_prob_collect)

    # Feature types listed here take their labels from the first pass rather
    # than from the last one. The list is empty, so the last pass always wins.
    first_pass_feature_types = []
    result_file_collect = {}
    for feature_type in label_prob_collect:
        if feature_type in first_pass_feature_types:
            per_file = all_relation_collect[0][feature_type]
        else:
            per_file = label_prob_collect[feature_type]
        for result_file in per_file:
            result_file_collect.setdefault(result_file, {})[
                feature_type] = per_file[result_file]

    for result_file in result_file_collect:
        # basename() also copes with a path that has no directory part, and
        # the explicit end index keeps the stem intact for an empty suffix.
        rel_filename = os.path.basename(result_file)
        stem = rel_filename[:len(rel_filename) - len(RESULT_SUFFIX)]
        no_tlink_file = os.path.join(no_tlink_directory,
                                     '%s%s' % (stem, NO_TLINK_SUFFIX))
        tlink_file = os.path.join(tlink_directory,
                                  '%s%s' % (stem, ADD_TLINK_SUFFIX))

        # NOTE(review): assumes parse_file() reads the whole stream before it
        # returns, so the handle can be closed right away -- confirm.
        with open(no_tlink_file, "r") as no_tlink_handle:
            xml_document = Parser().parse_file(no_tlink_handle)
        relations_by_type = result_file_collect[result_file]
        for feature_type in relations_by_type:
            relations = relations_by_type[feature_type]
            for line_counter in relations:
                (probability, raw_relType, id_0, id_1) = \
                    relations[line_counter]
                relType = _select_relation_type(probability, raw_relType)
                if relType != NORELATION:
                    xml_document.add_tlink(relType, id_0, id_1,
                                           SVM_CLASSIFIER_ORIGIN)
        xml_document.save_to_file(tlink_file)


def _mean_posterior_prior(label_prob_collect):
    """Return a new prior: per feature type, the mean over all samples of
    the posterior of each label.

    An alternative that was considered is to count, for every sample, only
    its most probable label (a posterior of 1 for that label); this function
    implements the soft version that adds up the posteriors themselves."""
    new_prior = {}
    for feature_type in label_prob_collect:
        totals = defaultdict(float)
        sample_count = 0
        per_file = label_prob_collect[feature_type]
        for result_file in per_file:
            relations = per_file[result_file]
            for line_counter in relations:
                probability = relations[line_counter][0]
                sample_count += 1
                # Diagnostic: show distributions whose two lowest values tie.
                ranked = sorted(probability.values())
                if len(ranked) > 1 and ranked[0] == ranked[1]:
                    print(probability)
                for label in probability:
                    totals[label] += probability[label]
        # totals is empty when no sample was seen, so no division by zero.
        for label in totals:
            totals[label] /= sample_count
        new_prior[feature_type] = totals
    return new_prior


def _select_relation_type(probability, raw_relType):
    """Return the most probable label, or raw_relType when the distribution
    is empty or flat and therefore carries no preference."""
    ranked = sorted(probability.values())
    if not ranked or ranked[0] == ranked[-1]:
        return raw_relType
    # Diagnostic: show distributions whose two highest values tie.
    if ranked[-1] == ranked[-2]:
        print(probability)
    # A stable sort keeps the original choice among tied labels: the last
    # one in iteration order.
    return sorted(probability.items(), key=lambda item: item[1])[-1][0]
Пример #26
0
def check_abnormal_single(tlink_file):
    """Count the event-event TLINKs in tlink_file that contradict the
    relations their two events have with the document creation time.

    An event counts as a main event when a TLINK connects it to the time
    with id 't0'. This does not guarantee that the event is the main event
    of its sentence, but it holds for the classified TLINKs. For every
    event-event TLINK between two main events, the relation of the link has
    to be one of those that LOGIC_COMPOSITION lists for the pair (event 0 to
    t0, t0 to event 1).

    Arguments:
       tlink_file - path of the XML document to check

    Returns a (wrong_number, total_number) tuple: the number of links that
    violate the constraint and the number of links that were checked."""

    # NOTE(review): assumes parse_file() reads the whole stream before it
    # returns, so the handle can be closed right away -- confirm.
    with open(tlink_file, "r") as tlink_handle:
        xml_document = Parser().parse_file(tlink_handle)

    ee_tlinks = []
    # Maps the id of a main event to the relation that holds from t0 to the
    # event, so the time is always the first argument.
    main_events = {}
    for element in xml_document.get_tags(TLINK):
        if not element.is_opening_tag():
            continue
        attrs = element.attrs
        # A link without a relation type contributes nothing below.
        if RELTYPE not in attrs:
            continue
        reltype = attrs[RELTYPE]
        if EVENT_INSTANCE_ID in attrs:
            eid = attrs[EVENT_INSTANCE_ID]
            if RELATED_TO_EVENT_INSTANCE in attrs:
                ee_tlinks.append(
                    (eid, attrs[RELATED_TO_EVENT_INSTANCE], reltype))
            if attrs.get(RELATED_TO_TIME) == 't0':
                # The link runs from the event to t0; reverse it so that it
                # reads from t0 to the event.
                main_events[eid] = reverse(reltype)
        if TIME_ID in attrs and RELATED_TO_EVENT_INSTANCE in attrs \
                and attrs[TIME_ID] == 't0':
            main_events[attrs[RELATED_TO_EVENT_INSTANCE]] = reltype

    total_number = 0
    wrong_number = 0
    for first_event, second_event, event_relation in ee_tlinks:
        if first_event not in main_events or second_event not in main_events:
            continue
        total_number += 1
        logging.info("=========================================")
        logging.info(first_event)
        logging.info(second_event)
        # event 0 is main, related to the dct time
        relation_1 = reverse(main_events[first_event])
        logging.info(relation_1)
        # the dct time is main, related to event 1
        relation_2 = main_events[second_event]
        logging.info(relation_2)
        # event 0 is main, related to event 1
        relation_3 = event_relation
        logging.info(relation_3)
        if relation_3 in LOGIC_COMPOSITION[(relation_1, relation_2)]:
            logging.info("Satisfy constraint")
        else:
            wrong_number += 1
            logging.warning("DOESN'T SATISFY")
    logging.info("Wrong number %d/ Total %d" % (wrong_number, total_number))
    return (wrong_number, total_number)
Пример #27
0
class HtmlGenerator:
    """An HtmlGenerator is created for an XML file with TimeML tags. It is
    used to create HTML files that display the text with events and
    times highlighted and links lined up with the sentences. It can
    also be used to create tables for events, times and links.

    Instance variables:
       file_name - the path handed to the constructor
       xmldoc - the document returned by Parser().parse_file()
       sentences - a list of Sentence objects, one for each <s> tag
       instance_idx - a mapping from eiid and eventID to MAKEINSTANCE elements
       event_idx - a mapping from eid to EVENT elements
       timex_idx - a mapping from tid to TIMEX3 elements, with an extra
          't0' entry for the document creation time
       events - a list of Event objects, one for each EVENT tag
       timexes - the TIMEX3 tags of the document
       links - a mapping from the id returned by Link.get_id() to the list
          of Link objects that have that id"""

    def __init__(self, file_name):
        """Parse file_name and build the sentence list and all indexes."""
        self.file_name = file_name
        # NOTE(review): assumes parse_file() reads the whole stream before it
        # returns, so the handle can be closed right away -- confirm.
        with open(file_name, 'r') as source:
            self.xmldoc = Parser().parse_file(source)
        self.sentences = []
        self.instance_idx = {}
        self.event_idx = {}
        self.timex_idx = {}
        self.events = []
        self.timexes = []
        self.links = {}
        self._init_sentence_list()
        # The indexes have to exist before the events list is built, because
        # every Event is created with its instance from instance_idx.
        self._init_indexes()
        self._init_events_list()
        self._init_timexes_list()
        self._init_link_index()

    def _init_sentence_list(self):
        """Fill in the self.sentences list. Sentences do not include their
        opening and closing tags."""
        for open_s_tag in self.xmldoc.get_tags('s'):
            close_s_tag = open_s_tag.get_closing_tag()
            elements = open_s_tag.get_slice_till(close_s_tag.id)
            self.sentences.append(Sentence(elements))

    def _init_events_list(self):
        """Fill in the self.events list, pairing every EVENT tag with the
        instance stored for its eid. Raises a KeyError for an event without
        an 'eid' attribute or without an instance."""
        for event in self.xmldoc.get_tags('EVENT'):
            instance = self.instance_idx[event.attrs['eid']]
            self.events.append(Event(event, instance))

    def _init_timexes_list(self):
        """Store the TIMEX3 tags of the document in self.timexes."""
        self.timexes = self.xmldoc.get_tags('TIMEX3')

    def _init_indexes(self):
        """Fill in the instance, event and timex indexes. The values of
        these indexes are always XML elements."""
        for instance in self.xmldoc.get_tags('MAKEINSTANCE'):
            # An instance can be looked up by its own id and by the id of
            # the event it belongs to.
            self.instance_idx[instance.attrs.get('eiid')] = instance
            self.instance_idx[instance.attrs.get('eventID')] = instance
        for event in self.xmldoc.get_tags('EVENT'):
            self.event_idx[event.attrs.get('eid')] = event
        for timex in self.xmldoc.get_tags('TIMEX3'):
            self.timex_idx[timex.attrs.get('tid')] = timex
        # need to add dct, but this needs to be done differently
        self.timex_idx['t0'] = create_dct_element('20080515')

    def _init_link_index(self):
        """Group the ALINKs, SLINKs and TLINKs of the document by the id
        that Link.get_id() returns for 'eventInstanceID' and 'timeID'."""
        link_elements = \
            self.xmldoc.get_tags('ALINK') + \
            self.xmldoc.get_tags('SLINK') + \
            self.xmldoc.get_tags('TLINK')
        for link_element in link_elements:
            link = Link(link_element, self.instance_idx)
            link_id = link.get_id(['eventInstanceID', 'timeID'])
            self.links.setdefault(link_id, []).append(link)

    def create_file(self, outfile, creators):
        """Creates a file with all sentences and the links lined up with the
        sentences. Only links whose creator is in creators are included."""
        with open(outfile, 'w') as fh:
            fh.write("<html>\n<head><style>\n" +
                     "s {display: block; text-decoration: none}\n" +
                     "</style>\n<body>\n" + "<table cellpadding=4>\n" +
                     "<tr>\n  <td>source\n  <td>%s\n" % self.file_name +
                     "<tr>\n  <td>components\n  <td>%s\n"
                     % ' + '.join(creators) +
                     "</table>\n\n" + "<hr>\n\n" +
                     "<table cellspacing=7pt>\n")
            for sentence in self.sentences:
                fh.write("\n<tr>\n  <td>")
                sentence.print_html(fh)
                fh.write("  <td>")
                for element_id in sentence.get_ids():
                    for link in self.links.get(element_id, []):
                        if link.creator in creators:
                            fh.write('   ' + link.convert() + "<br/>\n")
            fh.write("\n</table>\n</body>\n</html>\n")

    def create_events_table(self, file):
        """Write an HTML table with one row for each event to the path given
        in file. The columns are the event text with its eid, pos, class,
        tense, aspect, polarity and modality."""
        celltag = '<td bgcolor="#dddddd">'
        with open(file, 'w') as fh:
            fh.write("<html>\n<body>\n<table cellpadding=4>\n" +
                     "<tr align=\"left\">\n" +
                     "  <th bgcolor=\"#dddddd\">event\n" +
                     "  <th bgcolor=\"#dddddd\">pos\n" +
                     "  <th bgcolor=\"#dddddd\">class\n" +
                     "  <th bgcolor=\"#dddddd\">tense\n" +
                     "  <th bgcolor=\"#dddddd\">aspect\n" +
                     "  <th bgcolor=\"#dddddd\">polarity\n" +
                     "  <th bgcolor=\"#dddddd\">modality\n" + "</tr>\n")
            for event in self.events:
                fh.write("<tr>\n")
                eid = event.attrs['eid']
                text = event.attrs['text']
                pos = event.attrs['pos'].lower()
                eclass = event.attrs['class'].lower()
                tense = event.attrs['tense'].lower()
                aspect = event.attrs['aspect'].lower()
                polarity = event.attrs['polarity'].lower()
                modality = event.attrs.get('modality', '').lower()
                # Default values are left out to keep the table readable.
                if tense == 'none':
                    tense = ''
                if aspect == 'none':
                    aspect = ''
                if polarity == 'pos':
                    polarity = ''
                if aspect == 'perfective_progressive':
                    aspect = 'perf_prog'
                fh.write("  %s<font color=red>%s_%s</font>\n" %
                         (celltag, text, eid))
                for value in (pos, eclass, tense, aspect, polarity, modality):
                    fh.write("  %s%s\n" % (celltag, value))
            fh.write("</table>\n</body>\n</html>\n")

    def create_timexes_table(self, file):
        """Write an HTML table with one row for each timex to the path given
        in file. The columns are the timex text with its tid, the type and
        the value."""
        celltag = '<td bgcolor="#dddddd">'
        with open(file, 'w') as fh:
            fh.write("<html>\n<body>\n<table cellpadding=4>\n" +
                     "<tr align=\"left\">\n" +
                     "  <th bgcolor=\"#dddddd\">timex\n" +
                     "  <th bgcolor=\"#dddddd\">type\n" +
                     "  <th bgcolor=\"#dddddd\">value\n" + "</tr>\n")
            for timex in self.timexes:
                fh.write("<tr>\n")
                tid = timex.attrs['tid']
                text = timex.collect_content()
                timex_type = timex.attrs['TYPE'].lower()
                # A timex without a VAL attribute shows up as 'None'.
                value = timex.attrs.get('VAL')
                fh.write("  %s<font color=blue>%s_%s</font>\n" %
                         (celltag, text, tid))
                fh.write("  %s%s\n" % (celltag, timex_type))
                fh.write("  %s%s\n" % (celltag, str(value)))
            fh.write("</table>\n</body>\n</html>\n")

    def create_links_table(self, file):
        """Write an HTML table with one row for each link to the path given
        in file, following the order of the sentences. The columns are the
        text of the first argument, the relation type, the text of the
        second argument and the origin of the link."""
        celltag = '<td bgcolor="#dddddd">'
        with open(file, 'w') as fh:
            fh.write("<html>\n<body>\n<table cellpadding=4>\n")
            for sentence in self.sentences:
                for element_id in sentence.get_ids():
                    for link in self.links.get(element_id, []):
                        id1 = link.get_id1()
                        id2 = link.get_id2()
                        full_text1 = color_text(self._get_text(id1), id1)
                        full_text2 = color_text(self._get_text(id2), id2)
                        # Links without an origin fall back on the value of
                        # their 'syntax' attribute.
                        origin = link.attrs.get('origin')
                        if not origin:
                            origin = link.attrs.get('syntax')
                        fh.write("<tr>\n")
                        fh.write("  %s%s\n" % (celltag, full_text1))
                        fh.write("  %s%s\n" %
                                 (celltag, link.attrs['relType']))
                        fh.write("  %s%s\n" % (celltag, full_text2))
                        fh.write("  %s%s\n" % (celltag, origin))
                        fh.write("</tr>\n")
            fh.write("</table>\n</body>\n</html>\n")

    def _get_text(self, id):
        """Return the text for a timex id or an event instance id. Returns
        'DCT' for 't0' and None for an id that starts with neither 't' nor
        'e'."""
        # this is a bit of a hack and needs to be solved elsewhere
        # the DCT has no tag and the code to collect data crashes
        if id == 't0':
            return 'DCT'
        if id.startswith('t'):
            return self.timex_idx[id].collect_content()
        if id.startswith('e'):
            instance = self.instance_idx[id]
            event = self.event_idx[instance.attrs['eventID']]
            return event.collect_content()
Пример #28
0
class HtmlGenerator:

    """An HtmlGenerator is created for an XML file with TimeML tags. It is
    used to create HTML files that display the text with events and
    times highlighted and links lined up with the sentences. It can
    also be used to create tables for events, times and links.

    Instance variables:
       file_name - the path handed to the constructor
       xmldoc - the document returned by Parser().parse_file()
       sentences - a list of Sentence objects, one for each <s> tag
       instance_idx - a mapping from eiid and eventID to MAKEINSTANCE elements
       event_idx - a mapping from eid to EVENT elements
       timex_idx - a mapping from tid to TIMEX3 elements, with an extra
          't0' entry for the document creation time
       events - a list of Event objects, one for each EVENT tag
       timexes - the TIMEX3 tags of the document
       links - a mapping from the id returned by Link.get_id() to the list
          of Link objects that have that id"""

    def __init__(self, file_name):
        """Parse file_name and build the sentence list and all indexes."""
        self.file_name = file_name
        # NOTE(review): assumes parse_file() reads the whole stream before it
        # returns, so the handle can be closed right away -- confirm.
        with open(file_name, 'r') as source:
            self.xmldoc = Parser().parse_file(source)
        self.sentences = []
        self.instance_idx = {}
        self.event_idx = {}
        self.timex_idx = {}
        self.events = []
        self.timexes = []
        self.links = {}
        self._init_sentence_list()
        # The indexes have to exist before the events list is built, because
        # every Event is created with its instance from instance_idx.
        self._init_indexes()
        self._init_events_list()
        self._init_timexes_list()
        self._init_link_index()

    def _init_sentence_list(self):
        """Fill in the self.sentences list. Sentences do not include their
        opening and closing tags."""
        for open_s_tag in self.xmldoc.get_tags('s'):
            close_s_tag = open_s_tag.get_closing_tag()
            elements = open_s_tag.get_slice_till(close_s_tag.id)
            self.sentences.append(Sentence(elements))

    def _init_events_list(self):
        """Fill in the self.events list, pairing every EVENT tag with the
        instance stored for its eid. Raises a KeyError for an event without
        an 'eid' attribute or without an instance."""
        for event in self.xmldoc.get_tags('EVENT'):
            instance = self.instance_idx[event.attrs['eid']]
            self.events.append(Event(event, instance))

    def _init_timexes_list(self):
        """Store the TIMEX3 tags of the document in self.timexes."""
        self.timexes = self.xmldoc.get_tags('TIMEX3')

    def _init_indexes(self):
        """Fill in the instance, event and timex indexes. The values of
        these indexes are always XML elements."""
        for instance in self.xmldoc.get_tags('MAKEINSTANCE'):
            # An instance can be looked up by its own id and by the id of
            # the event it belongs to.
            self.instance_idx[instance.attrs.get('eiid')] = instance
            self.instance_idx[instance.attrs.get('eventID')] = instance
        for event in self.xmldoc.get_tags('EVENT'):
            self.event_idx[event.attrs.get('eid')] = event
        for timex in self.xmldoc.get_tags('TIMEX3'):
            self.timex_idx[timex.attrs.get('tid')] = timex
        # need to add dct, but this needs to be done differently
        self.timex_idx['t0'] = create_dct_element('20080515')

    def _init_link_index(self):
        """Group the ALINKs, SLINKs and TLINKs of the document by the id
        that Link.get_id() returns for 'eventInstanceID' and 'timeID'."""
        link_elements = \
            self.xmldoc.get_tags('ALINK') + \
            self.xmldoc.get_tags('SLINK') + \
            self.xmldoc.get_tags('TLINK')
        for link_element in link_elements:
            link = Link(link_element, self.instance_idx)
            link_id = link.get_id(['eventInstanceID', 'timeID'])
            self.links.setdefault(link_id, []).append(link)

    def create_file(self, outfile, creators):
        """Creates a file with all sentences and the links lined up with the
        sentences. Only links whose creator is in creators are included."""
        with open(outfile, 'w') as fh:
            fh.write(
                "<html>\n<head><style>\n" +
                "s {display: block; text-decoration: none}\n" +
                "</style>\n<body>\n" +
                "<table cellpadding=4>\n" +
                "<tr>\n  <td>source\n  <td>%s\n" % self.file_name +
                "<tr>\n  <td>components\n  <td>%s\n" % ' + '.join(creators) +
                "</table>\n\n" +
                "<hr>\n\n" +
                "<table cellspacing=7pt>\n")
            for sentence in self.sentences:
                fh.write("\n<tr>\n  <td>")
                sentence.print_html(fh)
                fh.write("  <td>")
                for element_id in sentence.get_ids():
                    for link in self.links.get(element_id, []):
                        if link.creator in creators:
                            fh.write('   ' + link.convert() + "<br/>\n")
            fh.write("\n</table>\n</body>\n</html>\n")

    def create_events_table(self, file):
        """Write an HTML table with one row for each event to the path given
        in file. The columns are the event text with its eid, pos, class,
        tense, aspect, polarity and modality."""
        celltag = '<td bgcolor="#dddddd">'
        with open(file, 'w') as fh:
            fh.write("<html>\n<body>\n<table cellpadding=4>\n" +
                     "<tr align=\"left\">\n" +
                     "  <th bgcolor=\"#dddddd\">event\n" +
                     "  <th bgcolor=\"#dddddd\">pos\n" +
                     "  <th bgcolor=\"#dddddd\">class\n" +
                     "  <th bgcolor=\"#dddddd\">tense\n" +
                     "  <th bgcolor=\"#dddddd\">aspect\n" +
                     "  <th bgcolor=\"#dddddd\">polarity\n" +
                     "  <th bgcolor=\"#dddddd\">modality\n" +
                     "</tr>\n")
            for event in self.events:
                fh.write("<tr>\n")
                eid = event.attrs['eid']
                text = event.attrs['text']
                pos = event.attrs['pos'].lower()
                eclass = event.attrs['class'].lower()
                tense = event.attrs['tense'].lower()
                aspect = event.attrs['aspect'].lower()
                polarity = event.attrs['polarity'].lower()
                modality = event.attrs.get('modality', '').lower()
                # Default values are left out to keep the table readable.
                if tense == 'none':
                    tense = ''
                if aspect == 'none':
                    aspect = ''
                if polarity == 'pos':
                    polarity = ''
                if aspect == 'perfective_progressive':
                    aspect = 'perf_prog'
                fh.write("  %s<font color=red>%s_%s</font>\n" %
                         (celltag, text, eid))
                for value in (pos, eclass, tense, aspect, polarity, modality):
                    fh.write("  %s%s\n" % (celltag, value))
            fh.write("</table>\n</body>\n</html>\n")

    def create_timexes_table(self, file):
        """Write an HTML table with one row for each timex to the path given
        in file. The columns are the timex text with its tid, the type and
        the value."""
        celltag = '<td bgcolor="#dddddd">'
        with open(file, 'w') as fh:
            fh.write("<html>\n<body>\n<table cellpadding=4>\n" +
                     "<tr align=\"left\">\n" +
                     "  <th bgcolor=\"#dddddd\">timex\n" +
                     "  <th bgcolor=\"#dddddd\">type\n" +
                     "  <th bgcolor=\"#dddddd\">value\n" +
                     "</tr>\n")
            for timex in self.timexes:
                fh.write("<tr>\n")
                tid = timex.attrs['tid']
                text = timex.collect_content()
                timex_type = timex.attrs['TYPE'].lower()
                # A timex without a VAL attribute shows up as 'None'.
                value = timex.attrs.get('VAL')
                fh.write("  %s<font color=blue>%s_%s</font>\n" %
                         (celltag, text, tid))
                fh.write("  %s%s\n" % (celltag, timex_type))
                fh.write("  %s%s\n" % (celltag, str(value)))
            fh.write("</table>\n</body>\n</html>\n")

    def create_links_table(self, file):
        """Write an HTML table with one row for each link to the path given
        in file, following the order of the sentences. The columns are the
        text of the first argument, the relation type, the text of the
        second argument and the origin of the link."""
        celltag = '<td bgcolor="#dddddd">'
        with open(file, 'w') as fh:
            fh.write("<html>\n<body>\n<table cellpadding=4>\n")
            for sentence in self.sentences:
                for element_id in sentence.get_ids():
                    for link in self.links.get(element_id, []):
                        id1 = link.get_id1()
                        id2 = link.get_id2()
                        full_text1 = color_text(self._get_text(id1), id1)
                        full_text2 = color_text(self._get_text(id2), id2)
                        # Links without an origin fall back on the value of
                        # their 'syntax' attribute.
                        origin = link.attrs.get('origin')
                        if not origin:
                            origin = link.attrs.get('syntax')
                        fh.write("<tr>\n")
                        fh.write("  %s%s\n" % (celltag, full_text1))
                        fh.write("  %s%s\n" %
                                 (celltag, link.attrs['relType']))
                        fh.write("  %s%s\n" % (celltag, full_text2))
                        fh.write("  %s%s\n" % (celltag, origin))
                        fh.write("</tr>\n")
            fh.write("</table>\n</body>\n</html>\n")

    def _get_text(self, id):
        """Return the text for a timex id or an event instance id. Returns
        'DCT' for 't0' and None for an id that starts with neither 't' nor
        'e'."""
        # this is a bit of a hack and needs to be solved elsewhere
        # the DCT has no tag and the code to collect data crashes
        if id == 't0':
            return 'DCT'
        if id.startswith('t'):
            return self.timex_idx[id].collect_content()
        if id.startswith('e'):
            instance = self.instance_idx[id]
            event = self.event_idx[instance.attrs['eventID']]
            return event.collect_content()
Пример #29
0
def tlink_inject_with_prior_with_check(no_tlink_file, result_file, tlink_file,
                                       original_file):
    """Add classifier TLINKs to a document, re-scoring event-event labels.

    The document in no_tlink_file gets one TLINK for every entry of the JSON
    vote file whose label is not NORELATION. Links that involve a time
    expression keep the voted label. For links between two event instances
    every voted label among BEFORE, AFTER and SIMULTANEOUS is scored as

        P(result_vector | label) x P(lemma_pair | label) x P(label)

    and the best scoring label replaces the voted one. The voted label is
    kept when none of the votes can be scored or when the best score is 0.
    The event-event TLINKs of original_file are only used to count, and
    log, how the re-scoring compares to the original relation.

    Arguments
       no_tlink_file - path of the XML document that receives the TLINKs
       result_file - path of the JSON file with the votes
       tlink_file - path the document with the added TLINKs is saved to
       original_file - path of the XML document with the original TLINKs

    Returns the tuple (fix_label_counter, worsen_label_counter): the number
    of changed labels that are now equal to the original relation, and the
    number of changed labels where the vote was equal to the original
    relation."""
    # close the inputs as soon as they are parsed
    with open(no_tlink_file, "r") as no_tlink_handle:
        xml_document = Parser().parse_file(no_tlink_handle)
    with open(original_file, "r") as original_handle:
        xmldoc_original = Parser().parse_file(original_handle)

    # Maps an event id to the lemmas generated from the text of the event.
    # EVENT tag sample: <EVENT class="OCCURRENCE" eid="e1000028">
    verb_events = {}
    for element in xml_document.get_tags(EVENT):
        if not element.is_opening_tag():
            continue
        eid = element.attrs[EID]
        event_content = element.next.content
        verb_synsets = wn.synsets(event_content, 'v')
        synsets_event = None
        if len(verb_synsets) > 0:
            # only the lemma names of the first verb synset are used
            synsets_event = verb_synsets[0].lemma_names
        verb_morphy = wn.morphy(event_content, 'v')
        verb_events[eid] = {
            MORPHY_LEMMA: verb_morphy,
            SYNSET_LEMMA: synsets_event
        }

    # Maps an event instance id to the lemmas of its event. Sample:
    # <MAKEINSTANCE eventID="e2" polarity="POS" pos="VERB" eiid="ei2"
    # tense="PRESENT" aspect="PERFECTIVE">
    verb_event_instance = {}
    for element in xml_document.get_tags(INSTANCE):
        if element.is_opening_tag():
            eiid = element.attrs[EIID]
            eid = element.attrs[EVENTID]
            if eid in verb_events:
                verb_event_instance[eiid] = verb_events[eid]

    # All TLINKs between two events in the original document, keyed on the
    # ordered pair of instance ids. Apart from the TLINKs the original and
    # the classified document should be identical, so the instances of the
    # classified document are used for both.
    original_ee_tlinks = {}
    for element in xmldoc_original.get_tags(TLINK):
        if not element.is_opening_tag():
            continue
        attrs = element.attrs
        lid = attrs[LID]
        if (EVENT_INSTANCE_ID in attrs
                and RELATED_TO_EVENT_INSTANCE in attrs
                and RELTYPE in attrs):
            eiid = attrs[EVENT_INSTANCE_ID]
            reiid = attrs[RELATED_TO_EVENT_INSTANCE]
            if eiid in verb_event_instance and reiid in verb_event_instance:
                original_ee_tlinks[(eiid, reiid)] = (lid, attrs[RELTYPE])

    # a separate name for the handle, the parameter keeps holding the path
    with open(result_file, 'r') as result_handle:
        label_vote_dict = json.load(result_handle)

    fix_label_counter = 0
    worsen_label_counter = 0
    scored_labels = (BEFORE, AFTER, SIMULTANEOUS)
    for feature_type in label_vote_dict:
        for line_counter in label_vote_dict[feature_type]:
            vote_entry = label_vote_dict[feature_type][line_counter]
            result_dict = vote_entry[RESULT_DICT]
            label_vote = vote_entry[VOTE_DICT]
            ids = vote_entry[TLINK_IDS_DICT]

            # label of the last entry in the votes, presumably the winning
            # vote - TODO confirm against the code that writes the file
            raw_relType = label_vote[-1][0]
            if raw_relType == NORELATION:
                continue

            # the ids of both ends of the link, keyed on '0' and '1'
            new_ids = {}
            for id_entry in ids:
                if id_entry[1] in [TID, EIID]:
                    new_ids[id_entry[0]] = id_entry[2]
            id_first = new_ids['0']
            id_second = new_ids['1']

            # relation in the original document, seen from id_first
            original_relation = None
            if (id_first, id_second) in original_ee_tlinks:
                original_relation = original_ee_tlinks[(id_first,
                                                        id_second)][1]
            elif (id_second, id_first) in original_ee_tlinks:
                original_relation = reverse(
                    original_ee_tlinks[(id_second, id_first)][1])

            # Only links without a time expression are re-scored, links
            # between a time and an event use the voted label.
            is_event_pair = not any(id_entry[1] == TID for id_entry in ids)
            if not is_event_pair:
                relType = raw_relType
            else:
                probability = {}
                result_prob = {}
                label_prob = {}
                lemma_pair_prob = {}
                max_label = None
                max_prob = None
                # set here so that the logging below never reads a value
                # that is unbound or left over from a previous entry
                morphy_1 = morphy_2 = None
                synset_1 = synset_2 = None

                # only labels that occur in the votes are considered
                for label in [str(vote[0]) for vote in label_vote]:
                    if label not in scored_labels:
                        continue

                    result_prob[label] = histogram.get_probability_vector(
                        result_dict, label)

                    # the morphy lemmas are only logged, not scored
                    morphy_1 = verb_event_instance[id_first][MORPHY_LEMMA]
                    morphy_2 = verb_event_instance[id_second][MORPHY_LEMMA]

                    # sum over all pairs of lemmas from the two synsets
                    lemma_pair_prob[label] = 0
                    synset_1 = verb_event_instance[id_first][SYNSET_LEMMA]
                    synset_2 = verb_event_instance[id_second][SYNSET_LEMMA]
                    if synset_1 is not None and synset_2 is not None:
                        for l_1, l_2 in itertools.product(synset_1, synset_2):
                            lemma_pair_prob[label] += \
                                crd.get_lemma_pair_prob_smoothing(
                                    (l_1, l_2), label)

                    label_prob[label] = histogram.get_probability_label(
                        label)

                    probability[label] = 1
                    probability[label] *= result_prob[label]
                    probability[label] *= lemma_pair_prob[label]
                    probability[label] *= label_prob[label]

                    if max_prob is None or max_prob < probability[label]:
                        max_prob = probability[label]
                        max_label = label

                # Follow the initial vote when there is nothing better:
                # max_prob is None when no vote could be scored and it is
                # 0 when all the probabilities are 0.
                if max_prob is None or max_prob == 0:
                    relType = raw_relType
                else:
                    relType = max_label

                need_to_keep_track = False
                if (relType == raw_relType and original_relation is not None
                        and original_relation != relType
                        and original_relation
                        in [BEFORE, AFTER, SIMULTANEOUS]):
                    need_to_keep_track = True
                    logging.info(
                        '---------------DOESNT HELP----------------')

                if relType != raw_relType and original_relation is not None:
                    need_to_keep_track = True
                    if original_relation == relType:
                        fix_label_counter += 1
                    if original_relation == raw_relType:
                        worsen_label_counter += 1
                    logging.info(
                        '---------------MAKE CHANGE----------------')

                if need_to_keep_track:
                    logging.info('Correct relation : %s' % original_relation)
                    logging.info('Original classified : %s' % raw_relType)
                    logging.info('Prior classified : %s' % relType)
                    logging.info(morphy_1)
                    logging.info(morphy_2)
                    logging.info(synset_1)
                    logging.info(synset_2)
                    logging.info('--result_prob--')
                    logging.info(result_prob)
                    logging.info('--label_prob--')
                    logging.info(label_prob)
                    logging.info('--lemma_pair_prob--')
                    logging.info(lemma_pair_prob)
                    logging.info(probability)
                    logging.info('==============================')

            xml_document.add_tlink(relType, id_first, id_second,
                                   SVM_CLASSIFIER_ORIGIN)

    xml_document.save_to_file(tlink_file)
    return (fix_label_counter, worsen_label_counter)
Пример #30
0
    def _add_tlinks_to_fragment(self, in_fragment, tmp_fragment, out_fragment):

        """Take the links from the merged tlinks and add them into the
        fragment. Based on the method with the same name in the
        classifier wrapper.

        All TLINKs are removed from the document in in_fragment, after
        which every TLINK of the document in tmp_fragment is added to it
        with its relation type, its two ids and its 'origin' attribute. An
        id is the event instance id if there is one and the time id
        otherwise. A warning is logged when an id cannot be found, the
        link is still added in that case.

        Arguments
           in_fragment - path of the fragment that receives the links
           tmp_fragment - path of the fragment with the merged links
           out_fragment - path the resulting fragment is saved to
        No return value."""

        # close the inputs as soon as they are parsed
        with open(in_fragment, 'r') as in_handle:
            xmldoc1 = Parser().parse_file(in_handle)
        with open(tmp_fragment, 'r') as tmp_handle:
            xmldoc2 = Parser().parse_file(tmp_handle)

        xmldoc1.remove_tags(TLINK)

        for tlink in xmldoc2.get_tags(TLINK):
            attrs = tlink.attrs
            reltype = attrs[RELTYPE]
            # an event instance id takes precedence over a time id
            id1 = attrs.get(EVENT_INSTANCE_ID) or attrs.get(TIME_ID)
            if not id1:
                logger.warn("Could not find id1 in " + tlink.content)
            id2 = (attrs.get(RELATED_TO_EVENT_INSTANCE)
                   or attrs.get(RELATED_TO_TIME))
            if not id2:
                logger.warn("Could not find id2 in " + tlink.content)
            origin = attrs.get('origin', '')
            xmldoc1.add_tlink(reltype, id1, id2, origin)

        xmldoc1.save_to_file(out_fragment)
Пример #31
0
def compare_performance_single(tlink_file, original_file):
    """Log how the event-event TLINKs of a classified file compare to the
    TLINKs of the original file and to the narrative scheme database.

    Only TLINKs between two event instances of the classified file are
    looked at. For each of them the label in the classified file is logged,
    and when the original file links the same two instances, in either
    direction, the original label is logged as well and the pair is counted
    as comparable, and as incompatible when the two labels differ. The
    narrative scheme counts of the pair are logged for the morphy lemmas
    and, summed over all lemma pairs, for the synset lemmas. Pairs without
    a TLINK in the original file and pairs where both events have the same
    morphy lemma are not processed any further.

    Arguments
       tlink_file - path of the classified XML document
       original_file - path of the original XML document
    No return value, all results are written to the log."""
    logging.info('================Compare======================')
    logging.info('Classified file: %s' % tlink_file)
    logging.info('Original file: %s' % original_file)

    # close the inputs as soon as they are parsed
    with open(tlink_file, "r") as classified_handle:
        xmldoc_classified = Parser().parse_file(classified_handle)
    with open(original_file, "r") as original_handle:
        xmldoc_original = Parser().parse_file(original_handle)

    # Maps an event id to the lemmas generated from the text of the event.
    # EVENT tag sample: <EVENT class="OCCURRENCE" eid="e1000028">
    verb_events = {}
    for element in xmldoc_classified.get_tags(EVENT):
        if not element.is_opening_tag():
            continue
        eid = element.attrs[EID]
        event_content = element.next.content
        verb_synsets = wn.synsets(event_content, 'v')
        synsets_event = None
        if len(verb_synsets) > 0:
            # only the lemma names of the first verb synset are used
            synsets_event = verb_synsets[0].lemma_names
        verb_morphy = wn.morphy(event_content, 'v')
        verb_events[eid] = {
            MORPHY_LEMMA: verb_morphy,
            SYNSET_LEMMA: synsets_event
        }

    # Maps an event instance id to the lemmas of its event. Sample:
    # <MAKEINSTANCE eventID="e2" polarity="POS" pos="VERB" eiid="ei2"
    # tense="PRESENT" aspect="PERFECTIVE">
    verb_event_instance = {}
    for element in xmldoc_classified.get_tags(INSTANCE):
        if element.is_opening_tag():
            eiid = element.attrs[EIID]
            eid = element.attrs[EVENTID]
            if eid in verb_events:
                verb_event_instance[eiid] = verb_events[eid]

    # all TLINKs of the classified document that are between two events
    ee_tlinks = []
    for element in xmldoc_classified.get_tags(TLINK):
        if not element.is_opening_tag():
            continue
        attrs = element.attrs
        lid = attrs[LID]
        if (EVENT_INSTANCE_ID in attrs
                and RELATED_TO_EVENT_INSTANCE in attrs
                and RELTYPE in attrs):
            eiid = attrs[EVENT_INSTANCE_ID]
            reiid = attrs[RELATED_TO_EVENT_INSTANCE]
            if eiid in verb_event_instance and reiid in verb_event_instance:
                ee_tlinks.append((eiid, reiid, (lid, attrs[RELTYPE])))

    # All TLINKs between two events in the original document. Apart from
    # the TLINKs the original and the classified document should be
    # identical, so the instances of the classified document are used.
    original_ee_tlinks = {}
    for element in xmldoc_original.get_tags(TLINK):
        if not element.is_opening_tag():
            continue
        attrs = element.attrs
        lid = attrs[LID]
        if (EVENT_INSTANCE_ID in attrs
                and RELATED_TO_EVENT_INSTANCE in attrs
                and RELTYPE in attrs):
            eiid = attrs[EVENT_INSTANCE_ID]
            reiid = attrs[RELATED_TO_EVENT_INSTANCE]
            if eiid in verb_event_instance and reiid in verb_event_instance:
                original_ee_tlinks[(eiid, reiid)] = (lid, attrs[RELTYPE])

    # event-event TLINKs in the classified file
    no_of_pair = 0
    # pairs that went through both lemma sections below
    no_of_pair_in_database = 0
    # pairs that also have a TLINK in the original file
    no_of_compare_pairs = 0
    # pairs where the classified label differs from the original one
    no_of_incompatible_pairs = 0

    for eiid, reiid, tlink in ee_tlinks:
        no_of_pair += 1
        # the lemmas of both events, looked up with the instance ids
        lemma_dict_1 = verb_event_instance[eiid]
        lemma_dict_2 = verb_event_instance[reiid]

        logging.info(
            "---------------------------------------------------------------")
        logging.info("---------Classified file---------")
        logging.info("Tlink id in classified file is %s" % tlink[0])
        relType = tlink[1]
        logging.info("Label in classified file is %s" % relType)

        if (eiid, reiid) in original_ee_tlinks:
            no_of_compare_pairs += 1
            original_relation = original_ee_tlinks[(eiid, reiid)][1]
            logging.info("---------Original file---------")
            # these two lines used to say "classified file" while printing
            # the id and the label of the original file
            logging.info("Tlink id in original file is %s" %
                         original_ee_tlinks[(eiid, reiid)][0])
            logging.info("Label in original file is %s" % original_relation)
            if relType != original_relation:
                no_of_incompatible_pairs += 1
        elif (reiid, eiid) in original_ee_tlinks:
            # linked the other way around in the original file
            no_of_compare_pairs += 1
            original_relation = reverse(original_ee_tlinks[(reiid, eiid)][1])
            logging.info("---------Original file---------")
            logging.info("In the original file, the TLINK is %s" %
                         original_relation)
            logging.info("Tlink in original file %s" %
                         original_ee_tlinks[(reiid, eiid)][0])
            if relType != original_relation:
                no_of_incompatible_pairs += 1
        else:
            continue

        logging.info("---------LEMMA---------")
        if (lemma_dict_1[MORPHY_LEMMA] is not None
                and lemma_dict_2[MORPHY_LEMMA] is not None):
            v_1 = lemma_dict_1[MORPHY_LEMMA]
            v_2 = lemma_dict_2[MORPHY_LEMMA]

            if v_1 == v_2:
                continue
            try:
                result = narrative_checker.check_in_dict(v_1, v_2)
                if result is not None:
                    logging.info("%s, %s, %d, %d" %
                                 (v_1, v_2, result[0], result[1]))
            except Exception as e:
                # best effort, a failed lookup is logged and skipped
                logging.error(str(e))

        logging.info("---------SYNSET---------")
        sum_result = [0, 0]
        if (lemma_dict_1[SYNSET_LEMMA] is not None
                and lemma_dict_2[SYNSET_LEMMA] is not None):
            v_1_list = lemma_dict_1[SYNSET_LEMMA]
            v_2_list = lemma_dict_2[SYNSET_LEMMA]

            for v_1, v_2 in itertools.product(v_1_list, v_2_list):
                if v_1 == v_2:
                    continue
                try:
                    result = narrative_checker.check_in_dict(v_1, v_2)
                    if result is not None:
                        sum_result[0] += result[0]
                        sum_result[1] += result[1]
                except Exception as e:
                    logging.error(str(e))
            logging.info(
                "%s\n %s\n %d, %d" %
                (str(v_1_list), str(v_2_list), sum_result[0], sum_result[1]))

        no_of_pair_in_database += 1

    # a file without event-event TLINKs used to end in a ZeroDivisionError
    if no_of_pair:
        found_ratio = float(no_of_pair_in_database) / no_of_pair
    else:
        found_ratio = 0.0

    # The backslash continuations below are inside the string literals, the
    # indentation of the continuation lines is part of the logged text.
    logging.info('Number of events tlink: %d' % no_of_pair)
    logging.info('Number of pairs in database : %d' % no_of_pair_in_database)
    logging.info('Percentage of pairs that are found \
                in narrative scheme database: %.2f' % found_ratio)
    logging.info('Number of pairs that are found in database\
                and original files as well %d' % no_of_compare_pairs)
    logging.info('Number of pairs that are found in database\
                and original files and not compatible %d' %
                 no_of_incompatible_pairs)
    logging.info(
        '============================================================')
Пример #32
0
class Blinker (TarsqiComponent):

    """Implements the Blinker component of Tarsqi. Blinker takes the
    shallow tree implemented in the Document object and applies rules
    that capture regularities between events and times as well as
    between events.

    Instance variables:
       NAME - a string
       rules - a BlinkerRuleDictionary
       rule2_index - a dictionary, quick access to type 2 rules
       dct - a string of the form YYYYMMDD, representing the document creation time
       xmldoc - an XmlDocument, created by xml_parser.Parser
       doctree - a Document, created by converter.FragmentConverter"""


    def __init__(self):
        """Set the component name, create the BlinkerRuleDictionary that
        holds the rules and fill the index on the rules of type 2."""
        self.NAME = BLINKER
        self.rule2_index = {}
        self.rules = BlinkerRuleDictionary()
        self._populate_rule2_index()

    def _populate_rule2_index(self):
        """Rules of type 2 (timex-signal-event) can be simply put in a
        hash keyed on the signals."""
        for rule in self.rules[2]:
            relation = rule.get_attribute('relation')[0]  # vals are now lists
            signal = rule.get_attribute('signal')[0]
            self.rule2_index[signal] = relation

    def process(self, infile, outfile, dct):
        """Apply all Blinker rules to the input file. Parses the xml
        file with xml_parser.Parser and converts it to a shallow tree
        with converter.FragmentConverter. Then runs the Blinker rules
        and saves the resulting document.
        Arguments
           infile - an absolute path
           outfile - an absolute path
           dct - the document creation time, stored in the dct variable
        No return value."""
        self.dct = dct
        # the file is only needed while parsing, close it right after
        with open(infile, "r") as xmlfile:
            self.xmldoc = Parser().parse_file(xmlfile)
        self.doctree = FragmentConverter(self.xmldoc, infile).convert(user=BLINKER)
        self._run_blinker()
        self.xmldoc.save_to_file(outfile)

    def _run_blinker(self):
        """Apply BLinker rules to the sentences in the doctree
        variable. Currently only deals with rule type 2, anchoring an
        event to a timex in those cases where there is a signal (that
        is, a preposition) available. New Tlinks are added just before
        the closing tag of the fragment."""

        self._run_timex_linking()
        self._apply_event_ordering_with_signal_rules()

        # variables needed for different rule types are prefixed with r<ruleNum>
        r3_event1 = None

        # iterate over sentences
        for si in range(len(self.doctree)):
            sentence = self.doctree[si]
            r3_main_event = None
            if _DEBUG5: print "processing sentence", si

            # iterate over elements within a sentence
            for i in range(len(sentence)):
                element = sentence[i]
                timex = element.get_timex()
                event = element.get_event()
                # RULE TYPE 2 
                if timex:
                    # chunk contains a timex, now try to anchor events to it
                    self._apply_event_anchoring_rules(sentence, timex, i)
                # RULE TYPE 3
                if event and element.isChunk() and element.isVerbChunk():
                    # the first verb event in a sentence is considered the main event
                    if not r3_main_event:
                        r3_main_event = event
                        # if previous sentence contained an event, create a link
                        if r3_event1:
                            r3_event2 = r3_main_event
                            self._apply_type3_rules(r3_event1, r3_event2)
                            r3_event1 = r3_event2
                        # else set event1
                        else:
                            r3_event1 = r3_main_event
                #"""
                # RULE TYPE 5
                if event and element.isChunk() \
                        and element.isVerbChunk() \
                        and event.attrs['class'] == 'REPORTING':
                    if _DEBUG5:
                        print "applying type 5 rules"
                    self._apply_type5_rules(sentence, event, i)
                #"""

            # R3: if no main event in sentence
            if not r3_main_event:
                r3_event1 = None



    def _run_timex_linking(self):

        """Apply the rules that govern relations between TIMEX3 tags. Only
        applies to TIMEX3 tags with a TYPE attribute equal to DATE. Each
        pair of those tags is handed to _create_timex_link once, in document
        order. A failure on a pair is logged and does not stop the linking
        of the other pairs."""

        timexes = []
        for timex in self.xmldoc.get_tags(TIMEX):
            if timex.attrs['TYPE'] == 'DATE':
                timexes.append(timex)

        for timex in timexes:
            if timex.attrs.get('VAL', None) is None:
                logger.warn("Missing VAL: %s" % timex.get_content())

        # pair each timex with all the timexes that come after it
        for i, first in enumerate(timexes):
            for second in timexes[i + 1:]:
                try:
                    self._create_timex_link(first, second)
                except Exception:
                    logger.error("Error in Timex Linking:\n%s\n%s" % \
                                 (first.get_content(),
                                  second.get_content()))

                        
    def _create_timex_link(self, timex1, timex2):

        """Try to create a TLINK between two timexes."""
        
        creation_year = self.dct[0:4]
        date1 = timex1.attrs.get('VAL', None)
        date2 = timex2.attrs.get('VAL', None)
        if date1 is None or date2 is None:
            return
        date1 = fix_timex_val(date1)
        date2 = fix_timex_val(date2)
        tid1 = timex1.attrs['tid']
        tid2 = timex2.attrs['tid']
        comment = "Blinker - Timex Linking"
        if date1 == date2:
            if date1 not in ('PAST_REF', 'FUTURE_REF'):
                self.xmldoc.add_tlink('IDENTITY', tid1, tid2, comment)
        else:
            rel = compare_date(date1, date2, creation_year)
            if rel != 'IDENTITY':
                self.xmldoc.add_tlink(rel, tid1, tid2, comment)


    def _apply_type3_rules(self, event1, event2):
        """ Creates a TLINK between two main events """
        if _DEBUG3:
            print event1.dtrs[0].getText(), event2.dtrs[0].getText()
            print event1.dtrs[0].getText(), event1.attrs['class'], \
                event1.attrs['tense'], event1.attrs['aspect']
            print event2.dtrs[0].getText(), event2.attrs['class'], \
                event2.attrs['tense'], event2.attrs['aspect']

        for i in range(len(self.rules[3])):
            rule = self.rules[3][i]
            if _DEBUG3:
                print "RULE %s:" % (rule.rule_number)
                print rule.attrs['arg1.class'], rule.attrs['arg1.tense'], rule.attrs['arg1.aspect']
                print rule.attrs['arg2.class'], rule.attrs['arg2.tense'], rule.attrs['arg2.aspect']

            # see tags.py and library.timeMLspec.py for attribute names
            if event1.attrs['class'] in rule.attrs['arg1.class'] and \
               event2.attrs['class'] in rule.attrs['arg2.class'] and \
               event1.attrs['tense'] in rule.attrs['arg1.tense'] and \
               event2.attrs['tense'] in rule.attrs['arg2.tense'] and \
               event1.attrs['aspect'] in rule.attrs['arg1.aspect'] and \
               event2.attrs['aspect'] in rule.attrs['arg2.aspect']:

                rel = rule.attrs['relation'][0]
                self.xmldoc.add_tlink( rel,
                                       event1.attrs[EIID],
                                       event2.attrs[EIID],
                                       "Blinker - Type 3 (rule %s)" % rule.rule_number)
                if _DEBUG3: print "RULE %s fired!" % rule.rule_number
                # apply the first matching rule
                return

    def _apply_type5_rules(self, sentence, event1, position):
        """ Creates TLINKs between the reporting event and reported events

        Takes as arguments sentence, reporting event constituent, and
        position of that constituent within the sentence list"""

        # filter out rules with wrong tense
        applicable_rules = self.rules[5][:]
        applicable_rules = [rule for rule in applicable_rules
                            if event1.attrs['tense'] in rule.attrs['arg1.tense']]

        # reset to opposite when quote is encountered
        direct = 'INDIRECT'

        # forward

        if _DEBUG5:
            print "inside rule application function"
            sentence.pretty_print()
        for i in range(position+1, len(sentence)):
            if _DEBUG5: print "processing element", i
            element = sentence[i]

            # quote
            if element.isToken() and element.getText() in QUOTES:
                if direct == 'DIRECT': direct = 'INDIRECT'
                if direct == 'INDIRECT': direct = 'DIRECT'

            # event 
            event2 = element.get_event()
            if event2 and element.isChunk() and element.isVerbChunk():
                current_rules = applicable_rules[:]
                current_rules = [rule for rule in current_rules if direct in rule.attrs['sentType']]
                if _DEBUG5:
                    print event1.dtrs[0].getText(), event2.dtrs[0].getText()
                    print event1.dtrs[0].getText(), event1.attrs['class'], \
                        event1.attrs['tense'], event1.attrs['aspect']
                    print event2.dtrs[0].getText(), event2.attrs['class'], \
                        event2.attrs['tense'], event2.attrs['aspect']
                for rule in current_rules:
                    # if attribute not set in the rule, accept any value
                    for att in ['class', 'tense', 'aspect']:
                        if not rule.attrs.has_key('arg2.'+att):
                            rule.attrs['arg2.'+att] = [event2.attrs[att]]
                    if _DEBUG5:
                        print "RULE %s (%s):" % (rule.rule_number, rule.attrs['sentType'][0])
                        print rule.attrs['arg1.class'], rule.attrs['arg1.tense'], \
                            rule.attrs['arg1.aspect']
                        print rule.attrs['arg2.class'], rule.attrs['arg2.tense'], \
                            rule.attrs['arg2.aspect']
                    # check that specified values match
                    if event2.attrs['class'] in rule.attrs['arg2.class'] and \
                       event2.attrs['tense'] in rule.attrs['arg2.tense'] and \
                       event2.attrs['aspect'] in rule.attrs['arg2.aspect']:

                        rel = rule.attrs['relation'][0]
                        self.xmldoc.add_tlink( rel,
                                               event1.attrs['eiid'],
                                               event2.attrs['eiid'],
                                               "Blinker - Type 5 (rule %s)" % rule.rule_number)
                        if _DEBUG5: print "RULE %s fired!" % rule.rule_number
                        # apply the first matching rule
                        return
                

        # backward

        # - this creates multiple links for REPORTING to REPORTING
        # - may add the appropriate rules to the rule file instead
        direct = 'INDIRECT'
        for i in range(position-1, -1, -1):   # ..,3,2,1,0
            if _DEBUG5: print "processing element", i
            element = sentence[i]

            # quote
            if element.isToken() and element.getText() in QUOTES:
                if direct == 'DIRECT': direct = 'INDIRECT'
                if direct == 'INDIRECT': direct = 'DIRECT'
                    

            # event 
            event2 = element.get_event()
            if event2 and element.isChunk() and element.isVerbChunk():
                current_rules = applicable_rules[:]
                current_rules = [rule for rule in current_rules if direct in rule.attrs['sentType']]
                if _DEBUG5:
                    print event1.dtrs[0].getText(), event2.dtrs[0].getText()
                    print event1.dtrs[0].getText(), event1.attrs['class'], \
                        event1.attrs['tense'], event1.attrs['aspect']
                    print event2.dtrs[0].getText(), event2.attrs['class'], \
                        event2.attrs['tense'], event2.attrs['aspect']
                    print "Applying rules for sentence type:", direct, len(current_rules), "rules"
                for rule in current_rules:
                    # if attribute not set in the rule, accept any value
                    for att in ['class', 'tense', 'aspect']:
                        if not rule.attrs.has_key('arg2.'+att):
                            rule.attrs['arg2.'+att] = [event2.attrs[att]]
                    if _DEBUG5:
                        print "RULE %s (%s):" % (rule.rule_number, rule.attrs['sentType'][0])
                        print rule.attrs['arg1.class'], rule.attrs['arg1.tense'], \
                            rule.attrs['arg1.aspect']
                        print rule.attrs['arg2.class'], rule.attrs['arg2.tense'], \
                            rule.attrs['arg2.aspect']
                    # check that specified values match
                    if event2.attrs['class'] in rule.attrs['arg2.class'] and \
                       event2.attrs['tense'] in rule.attrs['arg2.tense'] and \
                       event2.attrs['aspect'] in rule.attrs['arg2.aspect']:

                        rel = rule.attrs['relation'][0]
                        self.xmldoc.add_tlink( rel,
                                               event1.attrs['eiid'],
                                               event2.attrs['eiid'],
                                               "Blinker - Type 5 (rule %s)" % rule.rule_number)
                        if _DEBUG5: print "RULE %s fired!" % rule.rule_number
                        # apply the first matching rule
                        return



    def _apply_event_anchoring_rules(self, sentence, timex, i):

        """Anchor an event to the timex that occurs in the sentence at
        index i. Three simple syntactic patterns are tried in order and
        the first pattern that matches ends the search:

        1. [TIMEX EVENT]: the element at index i itself has an event,
           as in "October elections". An IS_INCLUDED link is added.

        2. [CHUNK-WITH-EVENT] Prep [CHUNK-WITH-TIMEX]: the lowercased
           preposition is looked up in self.rule2_index and a link with
           the relation found there is added. If the lookup yields no
           relation then no link is added, the default rule that should
           apply in that case is not yet implemented.

        3. [CHUNK-WITH-VERBAL-EVENT] [CHUNK-WITH-TIMEX]: the element
           right before the timex is a verb chunk with an event. An
           IS_INCLUDED link is added.

        Arguments:
           sentence - an indexable sequence of sentence elements
           timex - the timex, its attrs dictionary supplies the tid
           i - the index in sentence of the element with the timex

        Returns None, links are added to self.xmldoc with add_tlink()."""

        # NOTES:
        # - Need to add some kind of confidence measures

        # PATTERN: [TIMEX EVENT]
        # Or, more precisely, an event in the same chunk as the timex
        # Example: "October elections"
        event = sentence[i].get_event()
        if event:
            eiid = event.attrs[EIID]
            tid = timex.attrs[TID]
            self.xmldoc.add_tlink('IS_INCLUDED', eiid, tid, "Blinker - Type 1")
            return

        # Pattern: [CHUNK-WITH-EVENT] Prep [CHUNK-WITH-TIMEX]
        if i > 1:
            event = sentence[i-2].get_event()
            if sentence[i-1].isPreposition() and event:
                signal = sentence[i-1].getText().lower()
                rel = self.rule2_index.get(signal)
                eiid = event.attrs[EIID]
                tid = timex.attrs[TID]
                if _DEBUG2:
                    # single parenthesized argument, so the output is
                    # the same whether print is a statement or a function
                    print("FOUND: [%s] %s [%s] --> %s" %
                          (event.dtrs[0].getText(), signal, timex.getText(), rel))
                # Only add a link when the signal maps to a relation,
                # otherwise the link would have no relation type. The
                # pattern still counts as matched, so the verb chunk
                # pattern below is not tried.
                if rel:
                    self.xmldoc.add_tlink(rel, eiid, tid,
                                          "Blinker - Type 2 (%s)" % signal)
                return

        # Pattern: [CHUNK-WITH-VERBAL-EVENT] [CHUNK-WITH_TIMEX]
        if i > 0:
            previous_chunk = sentence[i-1]
            if previous_chunk.isVerbChunk():
                event = previous_chunk.get_event()
                if event:
                    # the polarity of the event is not taken into account
                    eiid = event.attrs[EIID]
                    tid = timex.attrs[TID]
                    self.xmldoc.add_tlink('IS_INCLUDED', eiid, tid, "Blinker - Type 1a")
                    return
            


    def _apply_event_ordering_with_signal_rules(self):

        """Some more rules without using any rules, basically a placeholder
        for event ordering rules that use a signal."""

        signal_mapping = {
            'after': 'AFTER',
            'before': 'BEFORE',
            'during': 'DURING'
            }
        
        for si in range(len(self.doctree)):
            sentence = self.doctree[si]
            for i in range(len(sentence)):

                try:
                    #print sentence[i:i+4]
                    (VG1, Prep, NG, VG2) = sentence[i:i+4]
                    event1 = VG1.get_event()
                    event2 = VG2.get_event()
                
                    # Pattern: [VG +Event] [Prep] [NG -Event] [VG +Event]

                    if event1 and VG1.isVerbChunk() and \
                            Prep.isPreposition() and \
                            NG.isNounChunk() and not NG.get_event() and \
                            event2 and VG2.isVerbChunk():
                        
                        #print "[VG +Event] [Prep] [NG -Event] [VG +Event]"
                        #print Prep
                        prep_token = Prep.getText().lower()
                        #print prep_token
                        rel = signal_mapping.get(prep_token)
                        #print rel
                        if rel:
                            #print 'adding tlink'
                            eiid1 = event1.attrs[EIID]
                            eiid2 = event2.attrs[EIID]
                            self.xmldoc.add_tlink(rel, eiid1, eiid2, "Blinker - Event:Signal:Event")
                            
                except:
                    pass