Example #1
 def getEventClass(self):
     try:
         headString = self.head.getText()
     except AttributeError:
         # This is used when the head is None, which can be the
         # case for some weird (and incorrect) chunks, like [to/TO]
         # (MV 11/08/07)
         return None
     # may want to use forms.be (MV 11/08/07)
     if headString in ['was', 'were', 'been']:
         head = 'is'
     else:
         head = DictVerbStems.get(headString, headString.lower())
     # this was indented, which was probably not the idea (MV 11/8/07)
     try:
         if forms.istateprog.match(head): return 'I_STATE'
         elif forms.reportprog.match(head): return 'REPORTING'
         elif forms.percepprog.match(head): return 'PERCEPTION'
         elif forms.iactionprog.match(head): return 'I_ACTION'
         elif forms.aspect1prog.match(head): return 'ASPECTUAL'
         elif forms.aspect2prog.match(head): return 'ASPECTUAL'
         elif forms.aspect3prog.match(head): return 'ASPECTUAL'
         elif forms.aspect4prog.match(head): return 'ASPECTUAL'
         elif forms.aspect5prog.match(head): return 'ASPECTUAL'
         elif forms.stateprog.match(head): return 'STATE'
         else: return 'OCCURRENCE'
     except:
         logger.warn("PROBLEM with noun object again. Verify.")
Example #2
    def _printSequence(self, sequence, depth):
        """Given a sentence or a piece of it, print the list of chunks and
        tokens it contains.  'depth' establishes the number of tabs to
        be printed for each item, in order to display it in a
        hierarchical manner.  """

        try:
            for item in sequence:
                if item.nodeType[-14:] == 'AdjectiveToken':
                    logger.debug(depth * "\t" + "ADJ TOKEN: " +
                                 item.getText() + "\t" + item.pos +
                                 "\t\tEvent:" + str(item.event))
                elif item.nodeType[-5:] == 'Token':
                    logger.debug(depth * "\t" + "TOKEN: " + item.getText() +
                                 "\t" + item.pos + "\t\tEvent:" +
                                 str(item.event))
                elif item.nodeType[-5:] == 'Chunk':
                    logger.debug(depth * "\t" + "CHUNK: " + item.nodeType +
                                 "\t\tEvent:" + str(item.event))
                elif item.nodeType == EVENT:
                    logger.debug(depth * "\t" + "EVENT: " + item.text + "\t" +
                                 item.pos)
                elif item.nodeType == TIMEX:
                    logger.debug(depth * "\t" + "TIMEX: " + item.getText())
                else:
                    raise Exception("ERROR: unknown item type: " + item.nodeType)
        except:
            logger.warn('Debugging error')
Example #3
 def _moderate_dct_vals(self):
     """There are five places where a DCT can be expressed: the DCT handed in
     with the --dct option or defined in the config file, the DCT from the
     metadata on the TarsqiDocument, the DCT from the metadata on the
     SourceDoc, DCTs from the TagRepository on the TarsqiDocument and DCTs
     from the TagRepository on the SourceDoc. The first three are single
     values or None, the other two are lists of any length. The order of
     these five is significant in that a DCT earlier on the list is given
     precedence over a DCT later on the list. Collects all the DCT values and
     picks the very first one, or today's date if no DCTs are available. Logs
     a warning if the DCTs do not all have the same value."""
     dcts = []
     for dct_val in [self.tarsqidoc.options.dct,
                     self.tarsqidoc.metadata.get('dct'),
                     self.tarsqidoc.sourcedoc.metadata.get('dct'),
                     _get_dct_values(self.tarsqidoc.sourcedoc.tags),
                     _get_dct_values(self.tarsqidoc.tags)]:
         if dct_val is None:
             # this is the case where there is no DCT in options or metadata
             continue
         elif isinstance(dct_val, list):
             dcts.extend(dct_val)
         else:
             dcts.append(dct_val)
     if len(set(dcts)) > 1:
         logger.warn("WARNING: more than one DCT value available")
     dct = dcts[0] if dcts else _get_today()
     self.tarsqidoc.metadata['dct'] = dct
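A note on the fallback: _get_today() is not shown in this snippet. A minimal sketch, assuming DCT values are plain YYYYMMDD strings (the format that _parse_tag_content builds in a later example on this page), might look like this:

import time

def _get_today():
    # Hypothetical fallback: today's date as a YYYYMMDD string, matching the
    # format assumed for the other DCT values collected above.
    return time.strftime("%Y%m%d")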
Example #4
    def process_fragments(self):

        """Set fragment names, create the vectors for each fragment, run the
        classifier and add links from the classifier to the fragments."""

        os.chdir(self.DIR_LINK_MERGER + os.sep + 'sputlink')
        perl = '/usr/local/ActivePerl-5.8/bin/perl'
        perl = 'perl'
        perl = self.tarsqi_instance.getopt_perl()

        for fragment in self.fragments:
            # set fragment names
            base = fragment[0]
            in_fragment = os.path.join(self.DIR_DATA, base+'.'+self.CREATION_EXTENSION)
            tmp_fragment = os.path.join(self.DIR_DATA, base+'.'+self.TMP_EXTENSION)
            out_fragment = os.path.join(self.DIR_DATA, base+'.'+self.RETRIEVAL_EXTENSION)
            # process them
            command = "%s merge.pl %s %s" % (perl, in_fragment, tmp_fragment)
            (i, o, e) = os.popen3(command)
            for line in e:
                if line.lower().startswith('warn'):
                    logger.warn('MERGING: ' + line)
                else:
                    logger.error('MERGING: ' + line)
            for line in o:
                logger.debug('MERGING: ' + line)
            self._add_tlinks_to_fragment(in_fragment, tmp_fragment, out_fragment)
        os.chdir(TTK_ROOT)
Example #5
File: graph.py Project: mnscholz/ttk
 def _intersect_constraints(self, edge, constraint):
     """Intersect the constraint that was just derived with the one already
     on the edge. There are four cases: (1) the new constraint, if it is the
     one originally handed to the propagate() function, introduces an
     inconsistency; (2) the new constraint is identical to the one already
     there and can be ignored; (3) the intersection of the new constraint
     with the old constraint is the same as the old constraint; and (4) the
     new constraint is more specific than the already existing
     constraint. The method returns a tuple of the resulting status and the
     intersection of the two constraints."""
     edge = self.edges[constraint.node1][constraint.node2]
     new_relset = constraint.relset
     existing_relset = edge.relset
     intersection = intersect_relations(new_relset, existing_relset)
     debug(2, "INTERSECT NEW {%s} WITH EXISTING {%s} --> {%s}"
           % (constraint.relset, edge.relset, intersection))
     if intersection == '':
         status = 'INCONSISTENT'
         logger.warn("Inconsistent new constraint: %s" % constraint)
         logger.warn("Clashes with: [%s] (derived from %s)"
                     % (edge.constraint, edge.constraint.history_string()))
     elif new_relset == existing_relset:
         status = 'NEW=EXISTING'
     elif intersection == existing_relset:
         status = 'INTERSECTION=EXISTING'
     else:
         status = 'INTERSECTION-IS-MORE-SPECIFIC'
     debug(2, "STATUS: %s" % status)
     return (status, intersection)
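The helper intersect_relations() is not part of the snippet. A minimal sketch, assuming each relset is a space-separated string of relation names (an assumption, not the project's confirmed representation):

def intersect_relations(rels1, rels2):
    # Keep only the relations present in both relsets, preserving the order
    # of the first argument; an empty result is the inconsistency case that
    # _intersect_constraints above reports.
    rels2_set = set(rels2.split())
    return ' '.join(r for r in rels1.split() if r in rels2_set)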
Example #6
    def _add_tlinks_to_fragment(self, in_fragment, tmp_fragment, out_fragment):

        """Take the links from the merged tlinks and add them into the
        fragment. Based on the method with the same name in the
        classifier wrapper."""

        xmldoc1 = Parser().parse_file(open(in_fragment,'r'))
        xmldoc2 = Parser().parse_file(open(tmp_fragment,'r'))

        xmldoc1.remove_tags(TLINK)
        
        for tlink in xmldoc2.get_tags(TLINK):
            reltype = tlink.attrs[RELTYPE]
            id1 = tlink.attrs.get(EVENT_INSTANCE_ID, None)
            if not id1:
                id1 = tlink.attrs.get(TIME_ID, None)
            if not id1:
                logger.warn("Could not find id1 in " + tlink.content)
            id2 = tlink.attrs.get(RELATED_TO_EVENT_INSTANCE, None)
            if not id2:
                id2 = tlink.attrs.get(RELATED_TO_TIME, None)
            if not id2:
                logger.warn("Could not find id2 in " + tlink.content)
            #origin = CLASSIFIER + ' ' + tlink.attrs.get(CONFIDENCE,'')
            origin = tlink.attrs.get('origin','')
            xmldoc1.add_tlink(reltype, id1, id2, origin)

        xmldoc1.save_to_file(out_fragment)
Example #7
File: tags.py Project: tarsqi/ttk
 def feature_value(self, name):
     # TODO: can probably use the local attrs dictionary for many of these
     if name == 'eventStatus':
         return '1'
     elif name == 'nodeType':
         return self.__class__.__name__
     elif name in (EVENTID, EIID, CLASS, TENSE, ASPECT, EPOS, STEM):
         return self.tree.events[self.eid][name]
     elif name == MOD:
         return self._get_attribute(name, 'NONE')
     elif name == POL:
         return self._get_attribute(name, 'POS')
     elif name in ('text', FORM):
         if self.tree.events.has_key(self.eid):
             return self.tree.events[self.eid][FORM]
         else:
             logger.warn("Event %s is not stored in the events on the TarsqiTree" % self)
             return ' '.join([t.text for t in get_tokens(self)])
     elif name == POS:
         try:
             return self.tree.events[self.eid][POS]
         except:
             # I don't remember whether POS has a particular use here
             # or is a left over from prior times
             logger.warn("Returning 'epos' instead of 'pos' value")
             return self.tree.events[self.eid][EPOS]
     else:
         raise AttributeError, name
Example #8
File: chunks.py Project: jasonzou/TARSQI
 def createEvent(self):
     logger.debug("createEvent in VerbChunk")
     #self.pretty_print()
     GramVChList = self.gramChunk()
     # do not attempt to create an event if there are no true
     # chunks in there
     true_chunks = GramVChList.trueChunkLists
     if len(true_chunks) == 1 and not true_chunks[0]:
         return
     # also skip if there is no content at all
     if len(GramVChList) == 0:
         logger.warn("Obtaining an empty GramVChList")
     # simple case
     elif len(GramVChList) == 1:
         logger.debug("len(GramVChList) == 1")
         self._createEventOnRightmostVerb(GramVChList[-1])
     # complex case
     else:
         logger.debug("len(GramVChList) > 1:" + str(len(GramVChList)))
         lastIdx = len(GramVChList)-1
         for idx in range(len(GramVChList)):
             gramVCh = GramVChList[idx]
             if idx == lastIdx:
                 self._createEventOnRightmostVerb(gramVCh)
             else:
                 logger.debug("[Not Last] " + gramVCh.as_extended_string())
                 if not gramVCh.isAuxVerb():
                     self._processEventInChunk(gramVCh)
Example #9
File: chunks.py Project: jasonzou/TARSQI
 def createEvent(self):
     logger.debug("createEvent in VerbChunk")
     #self.pretty_print()
     GramVChList = self.gramChunk()
     # do not attempt to create an event if there are no true
     # chunks in there
     true_chunks = GramVChList.trueChunkLists
     if len(true_chunks) == 1 and not true_chunks[0]:
         return
     # also skip if there is no content at all
     if len(GramVChList) == 0:
         logger.warn("Obtaining an empty GramVChList")
     # simple case
     elif len(GramVChList) == 1:
         logger.debug("len(GramVChList) == 1")
         self._createEventOnRightmostVerb(GramVChList[-1])
     # complex case
     else:
         logger.debug("len(GramVChList) > 1:" + str(len(GramVChList)))
         lastIdx = len(GramVChList) - 1
         for idx in range(len(GramVChList)):
             gramVCh = GramVChList[idx]
             if idx == lastIdx:
                 self._createEventOnRightmostVerb(gramVCh)
             else:
                 logger.debug("[Not Last] " + gramVCh.as_extended_string())
                 if not gramVCh.isAuxVerb():
                     self._processEventInChunk(gramVCh)
Example #10
File: graph.py Project: jasonzou/ttk
 def _intersect_constraints(self, edge, constraint):
     """Intersect the constraint that was just derived with the one already
     on the edge. There are four cases: (1) the new constraint, if it is the
     one originally handed to the propagate() function, introduces an
     inconsistency; (2) the new constraint is identical to the one already
     there and can be ignored; (3) the intersection of the new constraint
     with the old constraint is the same as the old constraint; and (4) the
     new constraint is more specific than the already existing
     constraint. The method returns a tuple of the resulting status and the
     intersection of the two constraints."""
     edge = self.edges[constraint.node1][constraint.node2]
     new_relset = constraint.relset
     existing_relset = edge.relset
     intersection = intersect_relations(new_relset, existing_relset)
     debug(
         2, "INTERSECT NEW {%s} WITH EXISTING {%s} --> {%s}" %
         (constraint.relset, edge.relset, intersection))
     if intersection == '':
         status = 'INCONSISTENT'
         logger.warn("Inconsistent new constraint: %s" % constraint)
         logger.warn("Clashes with: [%s] (derived from %s)" %
                     (edge.constraint, edge.constraint.history_string()))
     elif new_relset == existing_relset:
         status = 'NEW=EXISTING'
     elif intersection == existing_relset:
         status = 'INTERSECTION=EXISTING'
     else:
         status = 'INTERSECTION-IS-MORE-SPECIFIC'
     debug(2, "STATUS: %s" % status)
     return (status, intersection)
Example #11
File: tree.py Project: jasonzou/ttk
 def addEvent(self, event):
     """Takes an instance of evita.event.Event and adds it to the
     TagRepository on the TarsqiDocument. Does not add it if there is already
     an event at the same location."""
     # NOTE: we now always have one token on this list, if there are more in
     # a future implementation we take the last, but what probably should
     # happen is that we take the begin offset from the first and the end
     # offset from the last token.
     token = event.tokens[-1]
     if self.tarsqidoc.has_event(token.begin, token.end):
         logger.warn("There already is an event at that location.")
     else:
         event_attrs = dict(event.attrs)
         # with the current implementation, there is always one instance per
         # event, so we just reuse the event identifier for the instance
         eid = self.tarsqidoc.next_event_id()
         eiid = "ei%s" % eid[1:]
         event_attrs['eid'] = eid
         event_attrs['eiid'] = eiid
         # TODO: at least the second test does not seem needed anymore
         event_attrs = {
             k: v
             for k, v in event_attrs.items()
             if v is not None and k != 'eventID'
         }
         self.tarsqidoc.add_event(token.begin, token.end, event_attrs)
Example #12
File: tree.py Project: jasonzou/ttk
 def insert(self, tag):
     """Insert a Tag in the node. This could be insertion in one of the node's
     daughters, or insertion in the node's daughters list. Log a warning if
     the tag cannot be inserted."""
     # first check if tag offsets fit in self offsets
     if tag.begin < self.begin or tag.end > self.end:
         pass
     # add tag as first daughter if there are no daughters
     elif not self.dtrs:
         self.dtrs.append(Node(tag, self, self.tree))
     else:
         # find the index of the daughter that the tag would fit in and
         # insert the tag into the daughter
         idx = self._find_dtr_idx(tag)
         if idx is not None:
             self._insert_tag_into_dtr(tag, idx)
         else:
             # otherwise, find the insert point for the tag and insert it in
             # the dtrs list
             dtrs_idx = self._find_gap_idx(tag)
             if dtrs_idx is not None:
                 self.dtrs.insert(dtrs_idx, Node(tag, self, self.tree))
             else:
                 # otherwise, find the span of dtrs that the tag includes,
                 # replace the span with the tag and insert the span into the
                 # tag
                 span = self._find_span_idx(tag)
                 if span:
                     self._replace_span_with_tag(tag, span)
                 else:
                     # log warning if the tag cannot be inserted
                     # TODO: maybe downgrade to debug statement
                     logger.warn("Cannot insert %s" % tag)
                     raise NodeInsertionError
Example #13
File: wrapper.py Project: tarsqi/ttk
 def _export(self, text):
     """Export preprocessing information to the tag repository. Updates the
     TagRepository using the preprocessing result."""
     ctag = None
     for sentence in text:
         sentence_attrs = { 'id': TagId.next('s'), 'origin': PREPROCESSOR }
         stag = Tag('s', None, None, sentence_attrs)
         for token in sentence:
             if self._is_tag(token):
                 if not token.startswith('</'):
                     ctag = Tag(token[1:-1], None, None,
                                { 'id': TagId.next('c'), 'origin': PREPROCESSOR })
                 else:
                     ctag.end = last_ltag.end
                     self.document.tags.append(ctag)
                     ctag = None
             elif type(token) == TupleType:
                 ltag = self._make_ltag(token)
                 self.document.tags.append(ltag)
                 if stag.begin is None:
                     stag.begin = token[3]
                 if ctag is not None and ctag.begin is None:
                     ctag.begin = ltag.begin
                 last_end_offset = token[4]
                 last_ltag = ltag
             else:
                 logger.warn('Unexpected token type')
         stag.end = last_ltag.end
         self.document.tags.append(stag)
         # this indexing is needed because we bypassed the add_tag method on
         # TagRepository and instead directly appended to the tags list
         self.document.tags.index()
Example #14
 def _export(self, text):
     """Export preprocessing information to the tag repository. Updates the
     TagRepository using the preprocessing result."""
     ctag = None
     for sentence in text:
         sentence_attrs = {'id': TagId.next('s'), 'origin': PREPROCESSOR}
         stag = Tag('s', None, None, sentence_attrs)
         for token in sentence:
             if self._is_tag(token):
                 if not token.startswith('</'):
                     ctag = Tag(token[1:-1], None, None, {
                         'id': TagId.next('c'),
                         'origin': PREPROCESSOR
                     })
                 else:
                     ctag.end = last_ltag.end
                     self.document.tags.append(ctag)
                     ctag = None
             elif type(token) == tuple:
                 ltag = self._make_ltag(token)
                 self.document.tags.append(ltag)
                 if stag.begin is None:
                     stag.begin = token[3]
                 if ctag is not None and ctag.begin is None:
                     ctag.begin = ltag.begin
                 last_end_offset = token[4]
                 last_ltag = ltag
             else:
                 logger.warn('Unexpected token type')
         stag.end = last_ltag.end
         self.document.tags.append(stag)
         # this indexing is needed because we bypassed the add_tag method on
         # TagRepository and instead directly appended to the tags list
         self.document.tags.index()
Example #15
File: chunks.py Project: jasonzou/ttk
 def createEvent(self, verbfeatures=None, imported_events=None):
     """Try to create an event in the NounChunk. Checks whether the nominal
     is an event candidate, then conditionally adds it. The verbfeatures
     dictionary is used when a governing verb hands in its features to a
     nominal in a predicative complement. The imported_events is handed in
     when Tarsqi tries to import events from a previous annotation."""
     logger.debug("NounChunk.createEvent(verbfeatures=%s)" % verbfeatures)
     if self.isEmpty():
         # this happened at some point due to a crazy bug in some old code
         # that does not exist anymore, log a warning in case this returns
         logger.warn("There are no dtrs in the NounChunk")
     else:
         self.features = NChunkFeatures(self, verbfeatures)
         logger.debug(self.features.as_verbose_string())
         # don't bother if the head already is an event
         if self.features.head.isEvent():
             logger.debug("Nominal already contains an event")
         # Even if preceded by a BE or a HAVE form, only tagging NounChunks
         # headed by an eventive noun, so "was an intern" will NOT be tagged
         elif self._passes_syntax_test():
             imported_event = self._get_imported_event_for_chunk(
                 imported_events)
             #print imported_event
             if imported_event is not None:
                 self._conditionally_add_imported_event(imported_event)
             elif self._passes_semantics_test():
                 self._conditionallyAddEvent()
Example #16
File: features.py Project: tarsqi/ttk
 def _addInCurrentSublist(self, sublist, element):
     """Add the element to the current element (that is, the last element) in
     sublist. The elements of the sublist are lists themselves."""
     if len(sublist) - self.counter == 1:
         sublist[self.counter].append(element)
     else:
         logger.warn("length of chunk list and counter are out of sync")
Example #17
File: features.py Project: tarsqi/ttk
 def getEventClass(self):
     """Return the event class for the nominal, using the regular expressions
     in the library."""
     try:
         text = self.head.getText()
     except AttributeError:
         # This is used when the head is None, which can be the case for some
         # weird (and incorrect) chunks, like [to/TO]
         # TODO: make sure this cannot happen
         logger.warn("Cannot assign class to incorrect chunk")
         return None
     stem = 'is' if text in forms.be else DictVerbStems.get(text, text.lower())
     try:
         if forms.istateprog.match(stem): return  'I_STATE'
         elif forms.reportprog.match(stem): return 'REPORTING'
         elif forms.percepprog.match(stem): return 'PERCEPTION'
         elif forms.iactionprog.match(stem): return 'I_ACTION'
         elif forms.aspect1prog.match(stem): return 'ASPECTUAL'
         elif forms.aspect2prog.match(stem): return 'ASPECTUAL'
         elif forms.aspect3prog.match(stem): return 'ASPECTUAL'
         elif forms.aspect4prog.match(stem): return 'ASPECTUAL'
         elif forms.aspect5prog.match(stem): return 'ASPECTUAL'
         elif forms.stateprog.match(stem): return 'STATE'
         else: return 'OCCURRENCE'
     except:
         logger.warn("Error running event class patterns")
Example #18
 def __getitem__(self, index):
     """Get an item from the dtrs variable."""
     if index is None:
         logger.warn("Given index to __getitem__ in Sentence is None")
         return None
     else:
         return self.dtrs[index]
Example #19
 def getEventClass(self):
     """Return the event class for the nominal, using the regular expressions
     in the library."""
     try:
         text = self.head.getText()
     except AttributeError:
         # This is used when the head is None, which can be the case for some
         # weird (and incorrect) chunks, like [to/TO] (MV 11/08/07)
         logger.warn("Cannot assign class to incorrect chunk")
         return None
     stem = 'is' if text in forms.be else DictVerbStems.get(
         text, text.lower())
     try:
         if forms.istateprog.match(stem): return 'I_STATE'
         elif forms.reportprog.match(stem): return 'REPORTING'
         elif forms.percepprog.match(stem): return 'PERCEPTION'
         elif forms.iactionprog.match(stem): return 'I_ACTION'
         elif forms.aspect1prog.match(stem): return 'ASPECTUAL'
         elif forms.aspect2prog.match(stem): return 'ASPECTUAL'
         elif forms.aspect3prog.match(stem): return 'ASPECTUAL'
         elif forms.aspect4prog.match(stem): return 'ASPECTUAL'
         elif forms.aspect5prog.match(stem): return 'ASPECTUAL'
         elif forms.stateprog.match(stem): return 'STATE'
         else: return 'OCCURRENCE'
     except:
         logger.warn("Error running event class patterns")
Example #20
File: chunks.py Project: tarsqi/ttk
 def createEvent(self, verbfeatures=None, imported_events=None):
     """Try to create an event in the NounChunk. Checks whether the nominal
     is an event candidate, then conditionally adds it. The verbfeatures
     dictionary is used when a governing verb hands in its features to a
     nominal in a predicative complement. The imported_events is handed in
     when Tarsqi tries to import events from a previous annotation."""
     logger.debug("NounChunk.createEvent(verbfeatures=%s)" % verbfeatures)
     if self.isEmpty():
         # this happened at some point due to a crazy bug in some old code
         # that does not exist anymore, log a warning in case this returns
         logger.warn("There are no dtrs in the NounChunk")
     else:
         self.features = NChunkFeatures(self, verbfeatures)
         logger.debug(self.features.as_verbose_string())
         # don't bother if the head already is an event
         if self.features.head.isEvent():
             logger.debug("Nominal already contains an event")
         # Even if preceded by a BE or a HAVE form, only tagging NounChunks
         # headed by an eventive noun, so "was an intern" will NOT be tagged
         elif self._passes_syntax_test():
             imported_event = self._get_imported_event_for_chunk(imported_events)
             #print imported_event
             if imported_event is not None:
                 self._conditionally_add_imported_event(imported_event)
             elif self._passes_semantics_test():
                 self._conditionallyAddEvent()
Example #21
 def _addInCurrentSublist(self, sublist, element):
     """Add the element to the current element (that is, the last element) in
     sublist. The elements of the sublist are lists themselves."""
     if len(sublist) - self.counter == 1:
         sublist[self.counter].append(element)
     else:
         logger.warn("length of chunk list and counter are out of sync")
Example #22
File: tags.py Project: jasonzou/ttk
 def feature_value(self, name):
     # TODO: can probably use the local attrs dictionary for many of these
     if name == 'eventStatus':
         return '1'
     elif name == 'nodeType':
         return self.__class__.__name__
     elif name in (EVENTID, EIID, CLASS, TENSE, ASPECT, EPOS, STEM):
         return self.tree.events[self.eid][name]
     elif name == MOD:
         return self._get_attribute(name, 'NONE')
     elif name == POL:
         return self._get_attribute(name, 'POS')
     elif name in ('text', FORM):
         if self.tree.events.has_key(self.eid):
             return self.tree.events[self.eid][FORM]
         else:
             logger.warn(
                 "Event %s is not stored in the events on the TarsqiTree" %
                 self)
             return ' '.join([t.text for t in get_tokens(self)])
     elif name == POS:
         try:
             return self.tree.events[self.eid][POS]
         except:
             # I don't remember whether POS has a particular use here
             # or is a left over from prior times
             logger.warn("Returning 'epos' instead of 'pos' value")
             return self.tree.events[self.eid][EPOS]
     else:
         raise AttributeError, name
Example #23
File: chunker.py Project: tarsqi/ttk
 def _consume_term(self, term, idx):
     """Now that we know that a term starts at index idx, read the whole term
     and, if it matches a few requirements, add it to the chunk_tags
     dictionary. A term is an instance of docmodel.document.Tag."""
     begin_idx = idx
     end_idx = -1
     tag = self.sentence[idx]
     while term.begin <= tag[3] < term.end:
         end_idx = idx
         idx += 1
         if idx >= len(self.sentence):
             break
         tag = self.sentence[idx]
     final_tag = self.sentence[idx-1]
     if (end_idx > -1) and (final_tag[4] == term.end):
         # constituent found, set tags and return index after end
         pos = final_tag[1]
         if pos.startswith('V'):
             chunk_type = VG
         elif pos.startswith('N'):
             chunk_type = NG
         else:
             # do not create a chunk if this was not headed by a noun or verb
             return begin_idx
         self._set_tags(chunk_type, begin_idx, end_idx)
         return end_idx + 1
     else:
         # none found, return the initial index, this should actually not
         # happen so log a warning
         logger.warn("Could not consume full term")
         return begin_idx
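The index arithmetic above assumes token tuples shaped roughly like (text, pos, lemma, begin_offset, end_offset); this layout is inferred from the other snippets on this page (tag[1] is the POS tag, tag[3] and tag[4] are character offsets) and is shown here for illustration only:

tag = ("acquisition", "NN", "acquisition", 120, 131)   # hypothetical token tuple
pos, begin_offset, end_offset = tag[1], tag[3], tag[4]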
Example #24
 def _consume_term(self, term, idx):
     """Now that we know that a term starts at index idx, read the whole term
     and, if it matches a few requirements, add it to the chunk_tags
     dictionary. A term is an instance of docmodel.document.Tag."""
     begin_idx = idx
     end_idx = -1
     tag = self.sentence[idx]
     while term.begin <= tag[3] < term.end:
         end_idx = idx
         idx += 1
         if idx >= len(self.sentence):
             break
         tag = self.sentence[idx]
     final_tag = self.sentence[idx - 1]
     if (end_idx > -1) and (final_tag[4] == term.end):
         # constituent found, set tags and return index after end
         pos = final_tag[1]
         if pos.startswith('V'):
             chunk_type = VG
         elif pos.startswith('N'):
             chunk_type = NG
         else:
             # do not create a chunk if this was not headed by a noun or verb
             return begin_idx
         self._set_tags(chunk_type, begin_idx, end_idx)
         return end_idx + 1
     else:
         # none found, return the initial index, this should actually not
         # happen so log a warning
         logger.warn("Could not consume full term")
         return begin_idx
Example #25
 def process_fragments(self):
     """Calls, for each fragment, the Perl scripts that implement GUTime
     and merges the results back into the fragment."""
     os.chdir(self.DIR_GUTIME)
     self.dct = self.tarsqi_instance.metadata['dct']
     for fragment in self.fragments:
         # set fragment names
         base = fragment[0]
         in_file = "%s%s%s.%s" % (self.DIR_DATA, os.sep, base,
                                  self.CREATION_EXTENSION)
         tmp1_file = "%s%s%s.%s" % (self.DIR_DATA, os.sep, base,
                                    self.TMP1_EXTENSION)
         tmp2_file = "%s%s%s.%s" % (self.DIR_DATA, os.sep, base,
                                    self.TMP2_EXTENSION)
         out_file = "%s%s%s.%s" % (self.DIR_DATA, os.sep, base,
                                   self.RETRIEVAL_EXTENSION)
         print '-shit come here---------------------------------'
         print in_file
         print tmp1_file
         print tmp2_file
         print out_file
         # process them
         command = "perl gutime.pl -dct %s -t fragment %s %s" % (
             self.dct, in_file, tmp1_file)
         (fh_in, fh_out, fh_errors) = os.popen3(command)
         for line in fh_errors:
             logger.warn(line)
         merge_tags(in_file, tmp1_file, tmp2_file)
         self.btime.process(tmp2_file, out_file)
     os.chdir(TTK_ROOT)
Example #26
 def _moderate_dct_vals(self):
     """There are five places where a DCT can be expressed: the DCT handed in
     with the --dct option or defined in the config file, the DCT from the
     metadata on the TarsqiDocument, the DCT from the metadata on the
     SourceDoc, DCTs from the TagRepository on the TarsqiDocument and DCTs
     from the TagRepository on the SourceDoc. The first three are single
     values or None, the other two are lists of any length. The order of
     these five is significant in that a DCT earlier on the list is given
     precedence over a DCT later on the list. Collects all the DCT values and
     picks the very first one, or today's date if no DCTs are available. Logs
     a warning if the DCTs do not all have the same value."""
     dcts = []
     for dct_val in [self.tarsqidoc.options.dct,
                     self.tarsqidoc.metadata.get('dct'),
                     self.tarsqidoc.sourcedoc.metadata.get('dct'),
                     _get_dct_values(self.tarsqidoc.sourcedoc.tags),
                     _get_dct_values(self.tarsqidoc.tags)]:
         if dct_val is None:
             # this is the case where there is no DCT in options or metadata
             continue
         elif isinstance(dct_val, list):
             dcts.extend(dct_val)
         else:
             dcts.append(dct_val)
     if len(set(dcts)) > 1:
         logger.warn("WARNING: more than one DCT value available")
     dct = dcts[0] if dcts else _get_today()
     self.tarsqidoc.metadata['dct'] = dct
Example #27
File: features.py Project: tarsqi/ttk
 def getHead(self):
     """Return the head, which is the last element of the core in
     self.trueChunk, return None if there is no such core."""
     if self.trueChunk:
         return self.trueChunk[-1]
     else:
         logger.warn("empty trueChunk, head is set to None")
         return None
Example #28
def procEventStart(attrs):
    global currentEvent
    #print "Current Timex:", currentTimex, "||", "Current Event:", currentEvent
    if currentTimex is not None or currentEvent is not None:
        logger.warn("<EVENT> within <TIMEX3> or another <EVENT> tag")
        currentSentence.trackEmbedding(EVENT)
    else:
        currentEvent = EventTag(attrs)
Example #29
def procEventStart(attrs):
    global currentEvent
    # print "Current Timex:", currentTimex, "||", "Current Event:", currentEvent
    if currentTimex is not None or currentEvent is not None:
        logger.warn("<EVENT> within <TIMEX3> or another <EVENT> tag")
        currentSentence.trackEmbedding(EVENT)
    else:
        currentEvent = EventTag(attrs)
Example #30
 def getHead(self):
     """Return the head of the GramVChunk, which is the last element of the
     core in self.trueChunk, return None if there is no such core."""
     if self.trueChunk:
         return self.trueChunk[-1]
     else:
         logger.warn("empty trueChunk, head is set to None")
         return None
Example #31
    def _get_id(self, prefix, attrs, line):
        """Get the eiid or tid for the first or second object in the
        vector. The prefix is '0' or '1' and determines which object's
        id is returned."""

        id = attrs.get(prefix + EIID, attrs.get(prefix + TID, None))
        if not id:
            logger.warn("Could not find id in " + line)
        return id
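For illustration, the lookup assumes attribute names are formed by prepending the '0' or '1' prefix to the EIID/TID constants; the constant values and attrs below are hypothetical:

EIID, TID = 'eiid', 'tid'
attrs = {'0eiid': 'ei5', '1tid': 't3'}
attrs.get('0' + EIID, attrs.get('0' + TID, None))   # -> 'ei5'
attrs.get('1' + EIID, attrs.get('1' + TID, None))   # -> 't3'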
Example #32
 def _get_doc_source(self, xmldoc):
     """Returns the name of the content provider."""
     tag_DOCNO = xmldoc.tags['DOCNO'][0]
     content = tag_DOCNO.collect_content().strip()
     # TimeBank has only these providers
     for str in ('ABC', 'APW', 'AP', 'CNN', 'NYT', 'PRI', 'SJMN', 'VOA', 'WSJ', 'ea', 'ed'):
         if content.startswith(str):
             return str
     logger.warn("Could not determine document source from DOCNO tag")
     return 'GENERIC'
Example #33
 def _get_tag_content(self, tagname):
     """Return the text content of the first tag with name tagname, return
     None if there is no such tag."""
     try:
         tag = self._get_source().tags.find_tags(tagname)[0]
         content = self._get_source().text[tag.begin:tag.end].strip()
         return content
     except IndexError:
         logger.warn("Cannot get the %s tag in this document" % tagname)
         return None
Example #34
File: tokens.py Project: mnscholz/ttk
 def createAdjEvent(self, gramvchunk=None):
     """Processes the adjective after a copular verb and makes it an event if the
     adjective has an event class."""
     logger.debug("AdjectiveToken.createAdjEvent(gramvchunk)")
     if not self.parent.__class__.__name__ == 'Sentence':
         logger.warn("Unexpected syntax tree")
         return
     self.gramchunk = GramAChunk(self, gramvchunk)
     logger.debug(self.gramchunk.as_verbose_string())
     self._conditionallyAddEvent()
Example #35
 def _get_tag_content(self, tagname):
     """Return the text content of the first tag with name tagname, return
     None if there is no such tag."""
     try:
         tag = self.get_source().tags.find_tags(tagname)[0]
         content = self.get_source().text[tag.begin:tag.end].strip()
         return content
     except IndexError:
         logger.warn("Cannot get the %s tag in this document" % tagname)
         return None
Example #36
File: model.py Project: jasonzou/TARSQI
 def _get_doc_source(self, xmldoc):
     """Returns the name of the content provider."""
     tag_DOCNO = xmldoc.tags["DOCNO"][0]
     content = tag_DOCNO.collect_content().strip()
     # TimeBank has only these providers
     for str in ("ABC", "APW", "AP", "CNN", "NYT", "PRI", "SJMN", "VOA", "WSJ", "ea", "ed"):
         if content.startswith(str):
             return str
     logger.warn("Could not determine document source from DOCNO tag")
     return "GENERIC"
Example #37
 def createAdjEvent(self, verbfeatures=None):
     """Processes the adjective after a copular verb and makes it an event if the
     adjective has an event class."""
     logger.debug("AdjectiveToken.createAdjEvent(verbfeatures)")
     if not self.parent.__class__.__name__ == 'Sentence':
         logger.warn("Unexpected syntax tree")
         return
     self.features = AChunkFeatures(self, verbfeatures)
     logger.debug(self.features.as_verbose_string())
     self._conditionallyAddEvent()
Example #38
File: chunks.py Project: jasonzou/ttk
 def _getRestSent(self, structure_type):
     """Obtain the rest of the sentence as a list of tokens if structure_type is
     'flat' and as a list of constituents if structure type is 'chunked'. Log a
     warning and return a list of constituents for an unknown structure type."""
     if structure_type == 'flat':
         restSentence = utils.get_tokens(self.parent[self.position + 1:])
     else:
         restSentence = self.parent[self.position + 1:]
         if structure_type != 'chunked':
             logger.warn("unknown structure type: %s" % structure_type)
     return restSentence
Example #39
def procTimexStart(attrs):
    global currentSentence
    global currentTimex
    global currentChunk
    global timexWithinChunk

    if currentTimex is not None or currentEvent is not None:
        logger.warn("<TIMEX3> tag within <EVENT> or another <TIMEX3> tag")
        currentSentence.trackEmbedding(TIMEX)
    else:
        currentTimex = TimexTag(attrs)
Example #40
File: chunks.py Project: tarsqi/ttk
 def _getRestSent(self, structure_type):
     """Obtain the rest of the sentence as a list of tokens if structure_type is
     'flat' and as a list of constituents if structure type is 'chunked'. Log a
     warning and return a list of constituents for an unknown structure type."""
     if structure_type == 'flat':
         restSentence = utils.get_tokens(self.parent[self.position + 1:])
     else:
         restSentence = self.parent[self.position + 1:]
         if structure_type != 'chunked':
             logger.warn("unknown structure type: %s" % structure_type)
     return restSentence
Example #41
def procTimexStart(attrs):
    global currentSentence
    global currentTimex
    global currentChunk
    global timexWithinChunk

    if currentTimex is not None or currentEvent is not None:
        logger.warn("<TIMEX3> tag within <EVENT> or another <TIMEX3> tag")
        currentSentence.trackEmbedding(TIMEX)
    else:
        currentTimex = TimexTag(attrs)
Example #42
 def _get_doc_source(self):
     """Return the name of the content provider as well as the content of the DOCNO
     tag that has that information."""
     content = self._get_tag_content('DOCNO')
     content = str(content)  # in case the above returned None
     for source_identifier in ('ABC', 'APW', 'AP', 'CNN', 'NYT', 'PRI',
                               'SJMN', 'VOA', 'WSJ', 'ea', 'ed'):
         if content.startswith(source_identifier):
             return (source_identifier, content)
     logger.warn("Could not determine document source from DOCNO tag")
     return None
Example #43
 def _parse_tag_content(self, regexpr, tagname):
     """Return the DCT part of the tag content of tagname, requires a regular
     expression as one of the arguments."""
     content_string = self._get_tag_content(tagname)
     result = re.compile(regexpr).match(content_string)
     if result:
         (month, day, year) = result.groups()
         return "%s%s%s" % (year, month, day)
     else:
         logger.warn("Could not get date from %s tag" % tagname)
         return _get_today()
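A usage illustration with a hypothetical pattern whose groups come back in (month, day, year) order, as the method expects:

import re

regexpr = r".*(\d{2})/(\d{2})/(\d{4})"
result = re.compile(regexpr).match("NYT 01/15/1998 Business Desk")
(month, day, year) = result.groups()      # ('01', '15', '1998')
dct = "%s%s%s" % (year, month, day)       # '19980115'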
Example #44
 def _get_doc_source(self):
     """Return the name of the content provider as well as the content of the
     DOCNO tag that has that information."""
     content = self._get_tag_content('DOCNO')
     content = str(content)  # in case the above returned None
     for source_identifier in ('ABC', 'APW', 'AP', 'CNN', 'NYT', 'PRI',
                               'SJMN', 'VOA', 'WSJ', 'ea', 'ed'):
         if content.startswith(source_identifier):
             return source_identifier, content
     logger.warn("Could not determine document source from DOCNO tag")
     return None
Example #45
 def _parse_tag_content(self, regexpr, tagname):
     """Return the DCT part of the tag content of tagname, requires a regular
     expression as one of the arguments."""
     content_string = self._get_tag_content(tagname)
     result = re.compile(regexpr).match(content_string)
     if result:
         (month, day, year) = result.groups()
         return "%s%s%s" % (year, month, day)
     else:
         logger.warn("Could not get date from %s tag" % tagname)
         return get_today()
Example #46
 def alinkingContexts(self, key):
     """Returns the list of alink patterns from the dictionary."""
     form = lower(self.form)
     if self.nf_morph == VERB:
         pattern_dictionary = SLINKET_DICTS.alinkVerbsDict
     elif self.nf_morph == NOUN:
         pattern_dictionary = SLINKET_DICTS.alinkNounsDict
     else:
         logger.warn("SLINKS of type "+str(key)+" for EVENT form "+str(form)+" should be in the dict")
         return []
     return pattern_dictionary.get(form,{}).get(key,[])
Example #47
 def getTokens(self):
     """Return the list of tokens in a sentence."""
     # NOTE: seems to be used by the evitaNominalTrainer only
     tokenList = []
     for chunkOrToken in self.dtrs:
         if chunkOrToken.isToken():
             tokenList += [chunkOrToken]
         elif chunkOrToken.isChunk():
             tokenList += chunkOrToken.dtrs
         else:
             logger.warn("Sentence element that is not a chunk or token")
     return tokenList
Example #48
File: wrapper.py Project: tarsqi/ttk
def _run_gutime_on_file(fin, fout):
    """Run the GUTIME Perl script. Runs GUTime on an input file and creates an
    output file."""
    command = "perl TimeTag.pl %s > %s" % (fin, fout)
    pipe = subprocess.PIPE
    close_fds = False if sys.platform == 'win32' else True
    p = subprocess.Popen(command, shell=True,
                         stdin=pipe, stdout=pipe, stderr=pipe,
                         close_fds=close_fds)
    (fh_in, fh_out, fh_errors) = (p.stdin, p.stdout, p.stderr)
    for line in fh_errors:
        logger.warn(line)
Example #49
File: wrapper.py Project: mnscholz/ttk
def _run_gutime_on_file(fin, fout):
    """Run the GUTIME Perl script. Runs GUTime on an input file and creates an
    output file."""
    command = "perl TimeTag.pl %s > %s" % (fin, fout)
    pipe = subprocess.PIPE
    close_fds = False if sys.platform == 'win32' else True
    p = subprocess.Popen(command, shell=True,
                         stdin=pipe, stdout=pipe, stderr=pipe,
                         close_fds=close_fds)
    (fh_in, fh_out, fh_errors) = (p.stdin, p.stdout, p.stderr)
    for line in fh_errors:
        logger.warn(line)
Example #50
 def _extract_quotation(self, fragment):
     # TODO: this is a bit messy
     for idx in range(len(fragment)):
         try:
             # For some reason, it may break here (though rarely)
             if (fragment[idx].getText() == "''" and
                 (fragment[idx - 1].getText() == "," or
                  fragment[idx + 1].getText() == ",")):
                 return fragment[1:idx]
         except:
             logger.warn('Quotation could not be extracted')
     else:
         return
Example #51
File: wrapper.py Project: tarsqi/ttk
 def _export_tags(self, tagged_tokens):
     """Take the token tuples and add their pos and lemma information to the
     TagRepository in the TarsqiDocument."""
     for tagged_token in tagged_tokens:
         pos, lemma, p1, p2 = tagged_token[1:5]
         tags = self.document.tags.find_tags_at(p1)
         tags = [t for t in tags if t.end == p2 and t.name == 'lex']
         if len(tags) == 1:
             tags[0].attrs['pos'] = pos
             tags[0].attrs['lemma'] = lemma
             tags[0].attrs['origin'] += ",%s" % TAGGER
         else:
             logger.warn("More than one lex tag at position %d-%d" % (p1, p2))
Example #52
def procChunkStart(name):
    global currentSentence
    global currentTimex
    global currentChunk

    if currentEvent is not None:
        logger.warn("Chunk is contained within an <EVENT>")
        currentSentence.trackEmbedding(name)
    elif currentTimex is not None:
        currentSentence.trackEmbedding(name)
    elif currentChunk is not None:
        currentSentence.trackEmbedding(name)
    else:
        currentChunk = newChunk(name)
Example #53
File: features.py Project: tarsqi/ttk
 def _addInPreviousSublist(self, sublist, element):
     """Add the element to the previous element (that is, the penultimate
     element) in sublist. The elements of the sublist are lists themselves."""
     # TODO. This method is never used in the 300+ sentences in the tests in
     # testing/scripts/regression/evita/data-in. There is a use case for this
     # method though with phrases like "end up being eliminated", where "up"
     # should be added to the previous and not current list, which is now not
     # dealt with properly. Also, the logic of this method is a bit odd in
     # that when the counter is 0 the item will be added to the last (and
     # current) list, maybe want a warning instead.
     if len(sublist) >= self.counter-1:
         sublist[self.counter-1].append(element)
     else:
         # not sure whether this can actually occur
         logger.warn("list should be longer")
Example #54
File: chunker.py Project: tarsqi/ttk
 def _update_element(self, element):
     """Uses the orphans in the TarsqiTree of the element to update chunks."""
     # NOTE: this is generic sounding, but is really only meant for timexes
     # TODO: maybe rename while the above is the case
     doctree = create_tarsqi_tree(self.tarsqidoc, element)
     for orphan in doctree.orphans:
         sentence = self._get_containing_sentence(doctree, orphan)
         if sentence is None:
             logger.warn("No sentence contains %s" % orphan)
             continue
         nodes = [n for n in sentence.all_nodes() if n.overlaps(orphan)]
         nodes = [n for n in nodes if n is not sentence and not n.isToken()]
         #self._debug(orphan, sentence, nodes)
         self._remove_overlapping_chunks(nodes)
     self._add_chunks_for_timexes(element)
Example #55
File: main.py Project: tarsqi/ttk
 def run_timex_linking(self):
     """Apply the rules that govern relations between TIMEX3 tags. Only
     applies to TIMEX3 tags with type=DATE."""
     # TODO: add a DCT TIMEX tag if it is not in the tags dictionary, but
     # maybe check first whether it is in the dictionary in case we care
     # about duplications (see https://github.com/tarsqi/ttk/issues/10 and
     # https://github.com/tarsqi/ttk/issues/13)
     timexes = self.tarsqidoc.tags.find_tags(TIMEX)
     timexes = [t for t in timexes if t.attrs[TYPE] == 'DATE']
     pairs = _timex_pairs(timexes)
     for timex1, timex2 in pairs:
         try:
             self._create_timex_link(timex1, timex2)
         except Exception:
             # TODO: these are very common, usually caused by one of the
             # timexes not having a value; should look into this and
             # confirm this is always a GUTime issue.
             logger.warn("Error linking:\n%s\n%s" % (timex1, timex2))
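The helper _timex_pairs() is not shown here. A minimal sketch, assuming it simply yields every pair of the given timexes, could be:

from itertools import combinations

def _timex_pairs(timexes):
    # Hypothetical sketch: all unordered pairs of TIMEX3 tags, each of which
    # run_timex_linking above tries to link.
    return list(combinations(timexes, 2))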