Example #1
    def apply(self, timex, cur_context, dct, body, before, after):
        """
        Applies this rule to this timex, where body is the full extent covered
        by this timex, before is the preceeding text in the sentence, and after
        is the proceeding text in the sentence, in the [(token, POS), ...] form
        
        A boolean indicating whether or not application was successful is
        returned. The timex may also be modified, so should be passed in by
        reference.
        """

        # Check this rule type matches the timex type
        if self._type is not None and timex.type.lower() != self._type.lower():
            return (False, cur_context)

        # Check before, after and whole sentence guards
        if not self._check_guards(self._toks_to_str(before), self._before_guards):
            return (False, cur_context)

        if not self._check_guards(self._toks_to_str(after), self._after_guards):
            return (False, cur_context)

        if not self._check_guards(self._toks_to_str(body), self._guards):
            return (False, cur_context)

        if not self._check_guards(self._toks_to_str(before + body + after), self._sent_guards):
            return (False, cur_context)

        # Now, check if we match. _tokenise is either True, meaning the
        # standard whitespace-joined form of the sentence is used, or a
        # delimiter string to join the tokens with instead.
        if self._tokenise is True:
            senttext = self._toks_to_str(body)
            if self._deliminate_numbers:
                senttext = self._do_deliminate_numbers(senttext)
        else:
            senttext = self._tokenise.join([tok for (tok, pos, ts) in body])

        match = self._match.search(senttext)

        # If we do, then calculate attributes for the timex
        if match:

            if self._DEBUG:
                timex.comment = self.id

            # The rule's attribute expressions are evaluated with the local
            # names in scope here, notably `match` and `cur_context`, so they
            # can refer to groups captured by the pattern above
            try:
                if self._value_exp is not None:
                    timex.value = eval(self._value_exp)

                if self._type_exp is not None:
                    timex.type = eval(self._type_exp)

                if self._freq_exp is not None:
                    timex.freq = eval(self._freq_exp)

                if self._quant_exp is not None:
                    timex.quant = eval(self._quant_exp)

                if self._mod_exp is not None:
                    timex.mod = eval(self._mod_exp)

            except Exception as e:
                ternip.warn("Malformed rule expression", e)

            # Need to update current time context, if necessary
            return (True, cur_context)
        else:
            # Rule did not match
            return (False, cur_context)
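
The (success, context) contract means a caller can thread the timex context through a whole rule set. Below is a minimal sketch of that pattern, assuming a list of loaded rule objects exposing the apply() method above; the run_rules name and its arguments are illustrative, not part of TERNIP's API.

# Illustrative only: thread cur_context through a sequence of rules using
# the (success, context) return contract of apply() above. The rule objects,
# timex, dct and the token triples are assumed to come from the surrounding
# TERNIP machinery.
def run_rules(rules, timex, context, dct, body, before, after):
    for rule in rules:
        applied, context = rule.apply(timex, context, dct, body, before, after)
        # The timex is modified in place when a rule matches; the context
        # each rule returns is fed to the next one
    return context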
Example #2
#!/usr/bin/env python

from glob import glob
import sys
import os
import os.path

sys.path.append('..')
import ternip
import ternip.formats

if not os.path.isdir('preprocessed'):
    os.mkdir('preprocessed')

for fpath in glob(os.path.normpath('../sample_data/tern/data/english/ace_2004/*/*.sgm')):
    with open(fpath) as fd:
        try:
            doc = ternip.formats.tern(fd.read())
            print "Pre-processing", os.path.basename(fpath)
            doc.reconcile_dct(doc.get_dct_sents(), add_S='s', add_LEX='lex', pos_attr='pos')
            doc.reconcile(doc.get_sents(), add_S='s', add_LEX='lex', pos_attr='pos')
            with open(os.path.join('preprocessed', os.path.basename(fpath)), 'w') as ppfd:
                # Strip the 22-character XML declaration prepended by the
                # DOM serialiser, leaving the SGML-style body
                ppfd.write(str(doc)[22:])
        except Exception as e:
            ternip.warn('Cannot load document ' + os.path.basename(fpath), e)
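
A note on the magic number 22 in the write call above: assuming the tern document serialises itself through xml.dom.minidom (suggested by the DOM calls in reconcile() in Example #3), str(doc) begins with a fixed 22-character XML declaration, which the slice removes. A quick check of that assumption:

# Verifies the 22-character XML declaration length (minidom assumed)
from xml.dom.minidom import parseString
print parseString('<DOC/>').toxml()[:22]   # prints: <?xml version="1.0" ?>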

Example #3
    def reconcile(self, sents, add_S=False, add_LEX=False, pos_attr=False):
        """
        Reconciles this document against the new internal representation. If
        add_S is set to anything other than False, tags are added to indicate
        the sentence boundaries, with the tag name being the value of add_S.
        add_LEX is the same, but for marking token boundaries, and pos_attr
        is the name of the attribute which holds the POS tag for that token.
        This is mainly useful for transforming the TERN documents into
        something that GUTime can parse.

        If your document already contains S and LEX tags, and add_S/add_LEX is
        set to add them, the old S/LEX tags will be stripped first. If
        pos_attr is set and the attribute name differs from the old POS
        attribute name on the lex tag, then the old attribute will be removed.

        Sentence/token boundaries will not be altered in the final document
        unless add_S/add_LEX is set. If you have changed the token boundaries
        in the internal representation from the original form, but are not
        then adding them back in, reconciliation may give undefined results.

        Some inputs cannot be reconciled into valid XML. For example, if this
        document has elements which span parts of multiple sentences, rather
        than whole ones, then no placement of the new tags can yield valid
        XML, and failure will occur in unexpected ways.

        If you are adding LEX tags, and your XML document contains tags
        internal to tokens, then reconciliation will fail, as it expects each
        token to be one contiguous piece of text.
        """

        # First, add S tags if need be.
        if add_S:

            # First, strip any old ones
            if self._has_S:
                self._strip_tags(self._xml_doc, self._has_S, self._xml_body)

            # Then add the new ones
            leftover = self._add_S_tags(self._xml_body, sents, add_S)
            if len(leftover) > 1:
                raise nesting_error('Unable to add all S tags, possibly due to bad tag nesting: ' + str(leftover))

            # Update what we consider to be our S tags
            self._has_S = add_S

        # Now, get a list of the S nodes, which are used to reconcile
        # individual tokens
        if self._has_S:
            s_nodes = self._xml_body.getElementsByTagName(self._has_S)
        else:
            # There are no S tags in the text, so treat the whole body as a
            # single sentence belonging to the root node
            s_nodes = [self._xml_body]
            sents = [[part for sent in sents for part in sent]]

        # Now, add LEX tags if need be
        if add_LEX:

            # First, strip any old ones
            if self._has_LEX:
                self._strip_tags(self._xml_doc, self._has_LEX, self._xml_body)

            # Now add those LEX tags
            for i in range(len(sents)):
                self._add_LEX_tags(s_nodes[i], sents[i], add_LEX)

            # Update what we consider to be our LEX tags
            self._has_LEX = add_LEX

        # Now, add the POS attribute
        if pos_attr and self._has_LEX:

            # Get each LEX tag and add the attribute
            for i in range(len(sents)):
                lex_tags = s_nodes[i].getElementsByTagName(self._has_LEX)
                for j in range(len(sents[i])):
                    # Strip the existing attribute if need be
                    try:
                        lex_tags[j].removeAttribute(self._pos_attr)
                    except xml.dom.NotFoundErr:
                        pass

                    # Now set the new POS attribute
                    lex_tags[j].setAttribute(pos_attr, sents[i][j][1])

            # Update what we think the POS attribute is
            self._pos_attr = pos_attr

        # Strip old TIMEXes to avoid duplicates
        self.strip_timexes()

        # For XML documents, TIMEXes need unique IDs
        all_ts = set()
        for sent in sents:
            for (tok, pos, ts) in sent:
                for t in ts:
                    all_ts.add(t)
        ternip.add_timex_ids(all_ts)

        # Now iterate over each sentence
        for i in range(len(sents)):

            # Get all timexes in this sentence
            timexes = set()
            for (word, pos, ts) in sents[i]:
                for t in ts:
                    timexes.add(t)

            # Now, for each timex, add it to the sentence
            for timex in timexes:
                try:
                    self._add_timex(timex, sents[i], s_nodes[i])
                except nesting_error as e:
                    ternip.warn("Error whilst attempting to add TIMEX", e)