def getEventClass(self):
    try:
        headString = self.head.getText()
    except AttributeError:
        # This is used when the head is None, which can be the case for
        # some weird (and incorrect) chunks, like [to/TO] (MV 11/08/07)
        return None
    # may want to use forms.be (MV 11/08/07)
    if headString in ['was', 'were', 'been']:
        head = 'is'
    else:
        head = DictVerbStems.get(headString, headString.lower())
    # this was indented, which was probably not the idea (MV 11/8/07)
    try:
        if forms.istateprog.match(head):
            return 'I_STATE'
        elif forms.reportprog.match(head):
            return 'REPORTING'
        elif forms.percepprog.match(head):
            return 'PERCEPTION'
        elif forms.iactionprog.match(head):
            return 'I_ACTION'
        elif forms.aspect1prog.match(head):
            return 'ASPECTUAL'
        elif forms.aspect2prog.match(head):
            return 'ASPECTUAL'
        elif forms.aspect3prog.match(head):
            return 'ASPECTUAL'
        elif forms.aspect4prog.match(head):
            return 'ASPECTUAL'
        elif forms.aspect5prog.match(head):
            return 'ASPECTUAL'
        elif forms.stateprog.match(head):
            return 'STATE'
        else:
            return 'OCCURRENCE'
    except:
        logger.warn("PROBLEM with noun object again. Verify.")

def _printSequence(self, sequence, depth):
    """Given a sentence or a piece of it, print the list of chunks and
    tokens it contains. 'depth' establishes the number of tabs to be
    printed for each item, in order to display it in a hierarchical
    manner."""
    try:
        for item in sequence:
            if item.nodeType[-14:] == 'AdjectiveToken':
                logger.debug(depth * "\t" + "ADJ TOKEN: " + item.getText()
                             + "\t" + item.pos
                             + "\t\tEvent:" + str(item.event))
            elif item.nodeType[-5:] == 'Token':
                logger.debug(depth * "\t" + "TOKEN: " + item.getText()
                             + "\t" + item.pos
                             + "\t\tEvent:" + str(item.event))
            elif item.nodeType[-5:] == 'Chunk':
                logger.debug(depth * "\t" + "CHUNK: " + item.nodeType
                             + "\t\tEvent:" + str(item.event))
            elif item.nodeType == EVENT:
                logger.debug(depth * "\t" + "EVENT: " + item.text
                             + "\t" + item.pos)
            elif item.nodeType == TIMEX:
                logger.debug(depth * "\t" + "TIMEX: " + item.getText())
            else:
                # string exceptions are invalid, raise a real exception
                raise TypeError("unknown item type: " + item.nodeType)
    except:
        logger.warn('Debugging error')

def _moderate_dct_vals(self):
    """There are five places where a DCT can be expressed: the DCT handed
    in with the --dct option or defined in the config file, the DCT from
    the metadata on the TarsqiDocument, the DCT from the metadata on the
    SourceDoc, DCTs from the TagRepository on the TarsqiDocument and DCTs
    from the TagRepository on the SourceDoc. The first three are single
    values or None, the other two are lists of any length. The order of
    these five is significant in that a DCT earlier on the list is given
    precedence over a DCT later on the list. Collects all the DCT values
    and picks the very first one, or today's date if no DCTs are
    available. Logs a warning if the DCTs do not all have the same
    value."""
    dcts = []
    for dct_val in [self.tarsqidoc.options.dct,
                    self.tarsqidoc.metadata.get('dct'),
                    self.tarsqidoc.sourcedoc.metadata.get('dct'),
                    _get_dct_values(self.tarsqidoc.sourcedoc.tags),
                    _get_dct_values(self.tarsqidoc.tags)]:
        if dct_val is None:
            # this is the case where there is no DCT in options or metadata
            continue
        elif isinstance(dct_val, list):
            dcts.extend(dct_val)
        else:
            dcts.append(dct_val)
    if len(set(dcts)) > 1:
        logger.warn("WARNING: more than one DCT value available")
    dct = dcts[0] if dcts else _get_today()
    self.tarsqidoc.metadata['dct'] = dct

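# A minimal standalone sketch of the precedence logic above, outside the
# TarsqiDocument machinery. The candidates list and the fallback helper
# are hypothetical, not part of the TTK API.
def pick_dct(candidates, fallback):
    """Flatten the ordered candidates (single values, Nones, or lists)
    and return the first DCT found, or fallback() if there is none."""
    dcts = []
    for val in candidates:
        if val is None:
            continue
        elif isinstance(val, list):
            dcts.extend(val)
        else:
            dcts.append(val)
    return dcts[0] if dcts else fallback()

# pick_dct(['20070108', None, ['20070109']], lambda: '20250101')
# => '20070108'
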
def process_fragments(self):
    """Set fragment names, create the vectors for each fragment, run the
    classifier and add links from the classifier to the fragments."""
    os.chdir(self.DIR_LINK_MERGER + os.sep + 'sputlink')
    perl = self.tarsqi_instance.getopt_perl()
    for fragment in self.fragments:
        # set fragment names
        base = fragment[0]
        in_fragment = os.path.join(self.DIR_DATA,
                                   base + '.' + self.CREATION_EXTENSION)
        tmp_fragment = os.path.join(self.DIR_DATA,
                                    base + '.' + self.TMP_EXTENSION)
        out_fragment = os.path.join(self.DIR_DATA,
                                    base + '.' + self.RETRIEVAL_EXTENSION)
        # process them
        command = "%s merge.pl %s %s" % (perl, in_fragment, tmp_fragment)
        (i, o, e) = os.popen3(command)
        for line in e:
            if line.lower().startswith('warn'):
                logger.warn('MERGING: ' + line)
            else:
                logger.error('MERGING: ' + line)
        for line in o:
            logger.debug('MERGING: ' + line)
        self._add_tlinks_to_fragment(in_fragment, tmp_fragment,
                                     out_fragment)
    os.chdir(TTK_ROOT)

def _intersect_constraints(self, edge, constraint):
    """Intersect the constraint that was just derived with the one already
    on the edge. There are four cases: (1) the new constraint, if it is
    the one originally handed to the propagate() function, introduces an
    inconsistency; (2) the new constraint is identical to the one already
    there and can be ignored; (3) the intersection of the new constraint
    with the old constraint is the same as the old constraint; and (4) the
    new constraint is more specific than the already existing constraint.
    The method returns a (status, intersection) pair, where the status
    encodes which of the four cases applies."""
    edge = self.edges[constraint.node1][constraint.node2]
    new_relset = constraint.relset
    existing_relset = edge.relset
    intersection = intersect_relations(new_relset, existing_relset)
    debug(2, "INTERSECT NEW {%s} WITH EXISTING {%s} --> {%s}"
          % (constraint.relset, edge.relset, intersection))
    if intersection == '':
        status = 'INCONSISTENT'
        logger.warn("Inconsistent new constraint: %s" % constraint)
        logger.warn("Clashes with: [%s] (derived from %s)"
                    % (edge.constraint, edge.constraint.history_string()))
    elif new_relset == existing_relset:
        status = 'NEW=EXISTING'
    elif intersection == existing_relset:
        status = 'INTERSECTION=EXISTING'
    else:
        status = 'INTERSECTION-IS-MORE-SPECIFIC'
    debug(2, "STATUS: %s" % status)
    return (status, intersection)

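# A minimal sketch of what a relation-set intersection like the
# intersect_relations() call above could look like. It assumes relsets
# are strings of relation names separated by spaces (an assumption for
# illustration, not necessarily TTK's exact encoding); an empty result
# signals an inconsistency.
def intersect_relsets(relset1, relset2):
    """Return the space-separated intersection of two relation sets,
    preserving the order of relset1."""
    rels2 = set(relset2.split())
    return ' '.join(r for r in relset1.split() if r in rels2)

# intersect_relsets('< = >', '< =')  => '< ='
# intersect_relsets('<', '>')        => ''  (inconsistent)
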
def _add_tlinks_to_fragment(self, in_fragment, tmp_fragment, out_fragment):
    """Take the links from the merged tlinks and add them into the
    fragment. Based on the method with the same name in the classifier
    wrapper."""
    xmldoc1 = Parser().parse_file(open(in_fragment, 'r'))
    xmldoc2 = Parser().parse_file(open(tmp_fragment, 'r'))
    xmldoc1.remove_tags(TLINK)
    for tlink in xmldoc2.get_tags(TLINK):
        reltype = tlink.attrs[RELTYPE]
        id1 = tlink.attrs.get(EVENT_INSTANCE_ID, None)
        if not id1:
            id1 = tlink.attrs.get(TIME_ID, None)
        if not id1:
            logger.warn("Could not find id1 in " + tlink.content)
        id2 = tlink.attrs.get(RELATED_TO_EVENT_INSTANCE, None)
        if not id2:
            id2 = tlink.attrs.get(RELATED_TO_TIME, None)
        if not id2:
            logger.warn("Could not find id2 in " + tlink.content)
        # origin = CLASSIFIER + ' ' + tlink.attrs.get(CONFIDENCE, '')
        origin = tlink.attrs.get('origin', '')
        xmldoc1.add_tlink(reltype, id1, id2, origin)
    xmldoc1.save_to_file(out_fragment)

def feature_value(self, name):
    # TODO: can probably use the local attrs dictionary for many of these
    if name == 'eventStatus':
        return '1'
    elif name == 'nodeType':
        return self.__class__.__name__
    elif name in (EVENTID, EIID, CLASS, TENSE, ASPECT, EPOS, STEM):
        return self.tree.events[self.eid][name]
    elif name == MOD:
        return self._get_attribute(name, 'NONE')
    elif name == POL:
        return self._get_attribute(name, 'POS')
    elif name in ('text', FORM):
        if self.eid in self.tree.events:
            return self.tree.events[self.eid][FORM]
        else:
            logger.warn("Event %s is not stored in the events on the"
                        " TarsqiTree" % self)
            return ' '.join([t.text for t in get_tokens(self)])
    elif name == POS:
        try:
            return self.tree.events[self.eid][POS]
        except KeyError:
            # I don't remember whether POS has a particular use here
            # or is a left over from prior times
            logger.warn("Returning 'epos' instead of 'pos' value")
            return self.tree.events[self.eid][EPOS]
    else:
        raise AttributeError(name)

def createEvent(self):
    logger.debug("createEvent in VerbChunk")
    # self.pretty_print()
    GramVChList = self.gramChunk()
    # do not attempt to create an event if there are no true
    # chunks in there
    true_chunks = GramVChList.trueChunkLists
    if len(true_chunks) == 1 and not true_chunks[0]:
        return
    # also skip if there is no content at all
    if len(GramVChList) == 0:
        logger.warn("Obtaining an empty GramVChList")
    # simple case
    elif len(GramVChList) == 1:
        logger.debug("len(GramVChList) == 1")
        self._createEventOnRightmostVerb(GramVChList[-1])
    # complex case
    else:
        logger.debug("len(GramVChList) > 1:" + str(len(GramVChList)))
        lastIdx = len(GramVChList) - 1
        for idx in range(len(GramVChList)):
            gramVCh = GramVChList[idx]
            if idx == lastIdx:
                self._createEventOnRightmostVerb(gramVCh)
            else:
                logger.debug("[Not Last] " + gramVCh.as_extended_string())
                if not gramVCh.isAuxVerb():
                    self._processEventInChunk(gramVCh)

def addEvent(self, event):
    """Takes an instance of evita.event.Event and adds it to the
    TagRepository on the TarsqiDocument. Does not add it if there is
    already an event at the same location."""
    # NOTE: we now always have one token on this list, if there are more
    # in a future implementation we take the last, but what probably
    # should happen is that we take the begin offset from the first and
    # the end offset from the last token.
    token = event.tokens[-1]
    if self.tarsqidoc.has_event(token.begin, token.end):
        logger.warn("There already is an event at that location.")
    else:
        event_attrs = dict(event.attrs)
        # with the current implementation, there is always one instance
        # per event, so we just reuse the event identifier for the
        # instance, for example eid "e12" becomes eiid "ei12"
        eid = self.tarsqidoc.next_event_id()
        eiid = "ei%s" % eid[1:]
        event_attrs['eid'] = eid
        event_attrs['eiid'] = eiid
        # TODO: at least the second test does not seem needed anymore
        event_attrs = {k: v for k, v in event_attrs.items()
                       if v is not None and k != 'eventID'}
        self.tarsqidoc.add_event(token.begin, token.end, event_attrs)

def insert(self, tag):
    """Insert a Tag in the node. This could be insertion in one of the
    node's daughters, or insertion in the node's daughters list. Log a
    warning if the tag cannot be inserted."""
    # first check if tag offsets fit in self offsets
    if tag.begin < self.begin or tag.end > self.end:
        pass
    # add tag as first daughter if there are no daughters
    elif not self.dtrs:
        self.dtrs.append(Node(tag, self, self.tree))
    else:
        # find the index of the daughter that the tag would fit in and
        # insert the tag into the daughter
        idx = self._find_dtr_idx(tag)
        if idx is not None:
            self._insert_tag_into_dtr(tag, idx)
        else:
            # otherwise, find the insert point for the tag and insert it
            # in the dtrs list
            dtrs_idx = self._find_gap_idx(tag)
            if dtrs_idx is not None:
                self.dtrs.insert(dtrs_idx, Node(tag, self, self.tree))
            else:
                # otherwise, find the span of dtrs that the tag includes,
                # replace the span with the tag and insert the span into
                # the tag
                span = self._find_span_idx(tag)
                if span:
                    self._replace_span_with_tag(tag, span)
                else:
                    # log warning if the tag cannot be inserted
                    # TODO: maybe downgrade to debug statement
                    logger.warn("Cannot insert %s" % tag)
                    raise NodeInsertionError

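# A minimal standalone sketch of the gap search that a helper like
# _find_gap_idx() above could perform: given daughters sorted on their
# offsets, find the index where a new (begin, end) span fits without
# overlapping a neighbor. The list-of-pairs representation is an
# assumption for illustration, not the actual Node API.
def find_gap_idx(spans, begin, end):
    """Return the insert index for (begin, end) in the sorted span list,
    or None if the new span would overlap an existing one."""
    for idx, (b, e) in enumerate(spans):
        if end <= b:
            return idx    # fits entirely before this span
        if begin < e:
            return None   # overlaps this span
    return len(spans)     # fits after all existing spans

# find_gap_idx([(0, 5), (10, 15)], 6, 9)  => 1
# find_gap_idx([(0, 5), (10, 15)], 4, 9)  => None
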
def _export(self, text):
    """Export preprocessing information to the tag repository. Updates the
    TagRepository using the preprocessing result."""
    ctag = None
    for sentence in text:
        sentence_attrs = {'id': TagId.next('s'), 'origin': PREPROCESSOR}
        stag = Tag('s', None, None, sentence_attrs)
        for token in sentence:
            if self._is_tag(token):
                if not token.startswith('</'):
                    ctag = Tag(token[1:-1], None, None,
                               {'id': TagId.next('c'),
                                'origin': PREPROCESSOR})
                else:
                    ctag.end = last_ltag.end
                    self.document.tags.append(ctag)
                    ctag = None
            elif isinstance(token, tuple):
                ltag = self._make_ltag(token)
                self.document.tags.append(ltag)
                if stag.begin is None:
                    stag.begin = token[3]
                if ctag is not None and ctag.begin is None:
                    ctag.begin = ltag.begin
                last_end_offset = token[4]
                last_ltag = ltag
            else:
                logger.warn('Unexpected token type')
        stag.end = last_ltag.end
        self.document.tags.append(stag)
    # this indexing is needed because we bypassed the add_tag method on
    # TagRepository and instead directly appended to the tags list
    self.document.tags.index()

def createEvent(self, verbfeatures=None, imported_events=None):
    """Try to create an event in the NounChunk. Checks whether the nominal
    is an event candidate, then conditionally adds it. The verbfeatures
    dictionary is used when a governing verb hands in its features to a
    nominal in a predicative complement. imported_events is handed in
    when Tarsqi tries to import events from a previous annotation."""
    logger.debug("NounChunk.createEvent(verbfeatures=%s)" % verbfeatures)
    if self.isEmpty():
        # this happened at some point due to a crazy bug in some old code
        # that does not exist anymore, log a warning in case this returns
        logger.warn("There are no dtrs in the NounChunk")
    else:
        self.features = NChunkFeatures(self, verbfeatures)
        logger.debug(self.features.as_verbose_string())
        # don't bother if the head already is an event
        if self.features.head.isEvent():
            logger.debug("Nominal already contains an event")
        # Even if preceded by a BE or a HAVE form, only tag NounChunks
        # headed by an eventive noun, so "was an intern" will NOT be
        # tagged
        elif self._passes_syntax_test():
            imported_event = self._get_imported_event_for_chunk(
                imported_events)
            if imported_event is not None:
                self._conditionally_add_imported_event(imported_event)
            elif self._passes_semantics_test():
                self._conditionallyAddEvent()

def _addInCurrentSublist(self, sublist, element):
    """Add the element to the current element (that is, the last element)
    in sublist. The elements of the sublist are lists themselves."""
    if len(sublist) - self.counter == 1:
        sublist[self.counter].append(element)
    else:
        logger.warn("length of chunk list and counter are out of sync")

def getEventClass(self):
    """Return the event class for the nominal, using the regular
    expressions in the library."""
    try:
        text = self.head.getText()
    except AttributeError:
        # This is used when the head is None, which can be the case for
        # some weird (and incorrect) chunks, like [to/TO]
        # TODO: make sure this cannot happen
        logger.warn("Cannot assign class to incorrect chunk")
        return None
    stem = 'is' if text in forms.be else DictVerbStems.get(text,
                                                           text.lower())
    try:
        if forms.istateprog.match(stem):
            return 'I_STATE'
        elif forms.reportprog.match(stem):
            return 'REPORTING'
        elif forms.percepprog.match(stem):
            return 'PERCEPTION'
        elif forms.iactionprog.match(stem):
            return 'I_ACTION'
        elif forms.aspect1prog.match(stem):
            return 'ASPECTUAL'
        elif forms.aspect2prog.match(stem):
            return 'ASPECTUAL'
        elif forms.aspect3prog.match(stem):
            return 'ASPECTUAL'
        elif forms.aspect4prog.match(stem):
            return 'ASPECTUAL'
        elif forms.aspect5prog.match(stem):
            return 'ASPECTUAL'
        elif forms.stateprog.match(stem):
            return 'STATE'
        else:
            return 'OCCURRENCE'
    except:
        logger.warn("Error running event class patterns")

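# The elif cascade above can also be written table-driven; a sketch of
# that design choice. The pattern/class pairs would come from the same
# forms.*prog objects used above; building the table is left to the
# caller so this sketch stays free of TTK imports.
def classify_stem(stem, pattern_table, default='OCCURRENCE'):
    """Return the event class of the first pattern that matches stem, or
    the default when nothing matches (the same fall-through behavior as
    the elif chain above)."""
    for prog, event_class in pattern_table:
        if prog.match(stem):
            return event_class
    return default

# Example with plain compiled regexes standing in for the TTK patterns:
#   import re
#   table = [(re.compile('believ'), 'I_STATE'),
#            (re.compile('say'), 'REPORTING')]
#   classify_stem('says', table)  => 'REPORTING'
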
def __getitem__(self, index):
    """Get an item from the dtrs variable."""
    if index is None:
        logger.warn("Given index to __getitem__ in Sentence is None")
        return None
    else:
        return self.dtrs[index]

def _consume_term(self, term, idx):
    """Now that we know that a term starts at index idx, read the whole
    term and, if it matches a few requirements, add it to the chunk_tags
    dictionary. A term is an instance of docmodel.document.Tag."""
    begin_idx = idx
    end_idx = -1
    tag = self.sentence[idx]
    while term.begin <= tag[3] < term.end:
        end_idx = idx
        idx += 1
        if idx >= len(self.sentence):
            break
        tag = self.sentence[idx]
    final_tag = self.sentence[idx - 1]
    if (end_idx > -1) and (final_tag[4] == term.end):
        # constituent found, set tags and return index after end
        pos = final_tag[1]
        if pos.startswith('V'):
            chunk_type = VG
        elif pos.startswith('N'):
            chunk_type = NG
        else:
            # do not create a chunk if this was not headed by a noun or
            # verb
            return begin_idx
        self._set_tags(chunk_type, begin_idx, end_idx)
        return end_idx + 1
    else:
        # none found, return the initial index, this should actually not
        # happen so log a warning
        logger.warn("Could not consume full term")
        return begin_idx

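# A minimal sketch of the span-consumption idea above: walk tokens whose
# begin offsets fall inside a term's character span and check that the
# last one ends exactly where the term ends. Tokens are (text, pos,
# lemma, begin, end) tuples here, an assumption made for illustration.
def consume_span(tokens, idx, term_begin, term_end):
    """Return (begin_idx, end_idx) for the tokens covering the term, or
    None if the tokens do not line up with the term's end offset."""
    begin_idx, end_idx = idx, -1
    while idx < len(tokens) and term_begin <= tokens[idx][3] < term_end:
        end_idx = idx
        idx += 1
    if end_idx > -1 and tokens[end_idx][4] == term_end:
        return (begin_idx, end_idx)
    return None
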
def process_fragments(self):
    """Calls, for each fragment, the Perl scripts that implement GUTime
    and merges the results back into the fragment."""
    os.chdir(self.DIR_GUTIME)
    self.dct = self.tarsqi_instance.metadata['dct']
    for fragment in self.fragments:
        # set fragment names
        base = fragment[0]
        in_file = "%s%s%s.%s" % (self.DIR_DATA, os.sep, base,
                                 self.CREATION_EXTENSION)
        tmp1_file = "%s%s%s.%s" % (self.DIR_DATA, os.sep, base,
                                   self.TMP1_EXTENSION)
        tmp2_file = "%s%s%s.%s" % (self.DIR_DATA, os.sep, base,
                                   self.TMP2_EXTENSION)
        out_file = "%s%s%s.%s" % (self.DIR_DATA, os.sep, base,
                                  self.RETRIEVAL_EXTENSION)
        # process them
        command = "perl gutime.pl -dct %s -t fragment %s %s" \
                  % (self.dct, in_file, tmp1_file)
        (fh_in, fh_out, fh_errors) = os.popen3(command)
        for line in fh_errors:
            logger.warn(line)
        merge_tags(in_file, tmp1_file, tmp2_file)
        self.btime.process(tmp2_file, out_file)
    os.chdir(TTK_ROOT)

def procEventStart(attrs):
    global currentEvent
    # print "Current Timex:", currentTimex, "||", "Current Event:", currentEvent
    if currentTimex is not None or currentEvent is not None:
        logger.warn("<EVENT> within <TIMEX3> or another <EVENT> tag")
        currentSentence.trackEmbedding(EVENT)
    else:
        currentEvent = EventTag(attrs)

def getHead(self):
    """Return the head of the GramVChunk, which is the last element of the
    core in self.trueChunk, return None if there is no such core."""
    if self.trueChunk:
        return self.trueChunk[-1]
    else:
        logger.warn("empty trueChunk, head is set to None")
        return None

def _get_id(self, prefix, attrs, line):
    """Get the eiid or tid for the first or second object in the vector.
    The prefix is '0' or '1' and determines which object's id is
    returned."""
    id = attrs.get(prefix + EIID, attrs.get(prefix + TID, None))
    if not id:
        logger.warn("Could not find id in " + line)
    return id

def _get_doc_source(self, xmldoc):
    """Returns the name of the content provider."""
    tag_DOCNO = xmldoc.tags['DOCNO'][0]
    content = tag_DOCNO.collect_content().strip()
    # TimeBank has only these providers
    for source in ('ABC', 'APW', 'AP', 'CNN', 'NYT', 'PRI', 'SJMN',
                   'VOA', 'WSJ', 'ea', 'ed'):
        if content.startswith(source):
            return source
    logger.warn("Could not determine document source from DOCNO tag")
    return 'GENERIC'

def _get_tag_content(self, tagname):
    """Return the text content of the first tag with name tagname, return
    None if there is no such tag."""
    try:
        tag = self._get_source().tags.find_tags(tagname)[0]
        content = self._get_source().text[tag.begin:tag.end].strip()
        return content
    except IndexError:
        logger.warn("Cannot get the %s tag in this document" % tagname)
        return None

def createAdjEvent(self, gramvchunk=None):
    """Processes the adjective after a copular verb and makes it an event
    if the adjective has an event class."""
    logger.debug("AdjectiveToken.createAdjEvent(gramvchunk)")
    if not self.parent.__class__.__name__ == 'Sentence':
        logger.warn("Unexpected syntax tree")
        return
    self.gramchunk = GramAChunk(self, gramvchunk)
    logger.debug(self.gramchunk.as_verbose_string())
    self._conditionallyAddEvent()

def createAdjEvent(self, verbfeatures=None):
    """Processes the adjective after a copular verb and makes it an event
    if the adjective has an event class."""
    logger.debug("AdjectiveToken.createAdjEvent(verbfeatures)")
    if not self.parent.__class__.__name__ == 'Sentence':
        logger.warn("Unexpected syntax tree")
        return
    self.features = AChunkFeatures(self, verbfeatures)
    logger.debug(self.features.as_verbose_string())
    self._conditionallyAddEvent()

def _getRestSent(self, structure_type):
    """Obtain the rest of the sentence as a list of tokens if
    structure_type is 'flat' and as a list of constituents if structure
    type is 'chunked'. Log a warning and return a list of constituents
    for an unknown structure type."""
    if structure_type == 'flat':
        restSentence = utils.get_tokens(self.parent[self.position + 1:])
    else:
        restSentence = self.parent[self.position + 1:]
        if structure_type != 'chunked':
            logger.warn("unknown structure type: %s" % structure_type)
    return restSentence

def procTimexStart(attrs):
    global currentSentence
    global currentTimex
    global currentChunk
    global timexWithinChunk
    if currentTimex is not None or currentEvent is not None:
        logger.warn("<TIMEX3> tag within <EVENT> or another <TIMEX3> tag")
        currentSentence.trackEmbedding(TIMEX)
    else:
        currentTimex = TimexTag(attrs)

def _get_doc_source(self):
    """Return the name of the content provider as well as the content of
    the DOCNO tag that has that information."""
    content = self._get_tag_content('DOCNO')
    content = str(content)  # in case the above returned None
    for source_identifier in ('ABC', 'APW', 'AP', 'CNN', 'NYT', 'PRI',
                              'SJMN', 'VOA', 'WSJ', 'ea', 'ed'):
        if content.startswith(source_identifier):
            return (source_identifier, content)
    logger.warn("Could not determine document source from DOCNO tag")
    return None

def _parse_tag_content(self, regexpr, tagname):
    """Return the DCT part of the tag content of tagname, requires a
    regular expression as one of the arguments."""
    content_string = self._get_tag_content(tagname)
    result = re.compile(regexpr).match(content_string)
    if result:
        (month, day, year) = result.groups()
        return "%s%s%s" % (year, month, day)
    else:
        logger.warn("Could not get date from %s tag" % tagname)
        return _get_today()

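# A usage sketch for the method above. The pattern is a hypothetical
# example of the kind of expression it expects: three groups in
# month/day/year order, which the method reorders into YYYYMMDD.
#
#   dct = self._parse_tag_content(r'(\d\d)/(\d\d)/(\d{4})', 'DATE_TIME')
#
# Given a DATE_TIME tag containing "01/08/2007 14:30", the match groups
# are ('01', '08', '2007') and the return value is "20070108".
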
def alinkingContexts(self, key):
    """Returns the list of alink patterns from the dictionary."""
    form = lower(self.form)
    if self.nf_morph == VERB:
        pattern_dictionary = SLINKET_DICTS.alinkVerbsDict
    elif self.nf_morph == NOUN:
        pattern_dictionary = SLINKET_DICTS.alinkNounsDict
    else:
        logger.warn("ALINKs of type " + str(key) + " for EVENT form "
                    + str(form) + " should be in the dict")
        return []
    return pattern_dictionary.get(form, {}).get(key, [])

def getTokens(self):
    """Return the list of tokens in a sentence."""
    # NOTE: seems to be used by the evitaNominalTrainer only
    tokenList = []
    for chunkOrToken in self.dtrs:
        if chunkOrToken.isToken():
            tokenList += [chunkOrToken]
        elif chunkOrToken.isChunk():
            tokenList += chunkOrToken.dtrs
        else:
            logger.warn("Sentence element that is not a chunk or token")
    return tokenList

def _run_gutime_on_file(fin, fout):
    """Run the GUTIME Perl script. Runs GUTime on an input file and
    creates an output file."""
    command = "perl TimeTag.pl %s > %s" % (fin, fout)
    pipe = subprocess.PIPE
    close_fds = False if sys.platform == 'win32' else True
    p = subprocess.Popen(command, shell=True,
                         stdin=pipe, stdout=pipe, stderr=pipe,
                         close_fds=close_fds)
    (fh_in, fh_out, fh_errors) = (p.stdin, p.stdout, p.stderr)
    for line in fh_errors:
        logger.warn(line)

def _extract_quotation(self, fragment):
    # TODO: this is a bit messy
    for idx in range(len(fragment)):
        try:
            # For some reason, it may break here (though rarely)
            if (fragment[idx].getText() == "''"
                and (fragment[idx - 1].getText() == ","
                     or fragment[idx + 1].getText() == ",")):
                return fragment[1:idx]
        except:
            logger.warn('Quotation could not be extracted')
    else:
        return

def _export_tags(self, tagged_tokens):
    """Take the token tuples and add their pos and lemma information to
    the TagRepository in the TarsqiDocument."""
    for tagged_token in tagged_tokens:
        pos, lemma, p1, p2 = tagged_token[1:5]
        tags = self.document.tags.find_tags_at(p1)
        tags = [t for t in tags if t.end == p2 and t.name == 'lex']
        if len(tags) == 1:
            tags[0].attrs['pos'] = pos
            tags[0].attrs['lemma'] = lemma
            tags[0].attrs['origin'] += ",%s" % TAGGER
        else:
            logger.warn("Did not find exactly one lex tag at position"
                        " %d-%d" % (p1, p2))

def procChunkStart(name):
    global currentSentence
    global currentTimex
    global currentChunk
    if currentEvent is not None:
        logger.warn("Chunk is contained within an <EVENT>")
        currentSentence.trackEmbedding(name)
    elif currentTimex is not None:
        currentSentence.trackEmbedding(name)
    elif currentChunk is not None:
        currentSentence.trackEmbedding(name)
    else:
        currentChunk = newChunk(name)

def _addInPreviousSublist(self, sublist, element):
    """Add the element to the previous element (that is, the penultimate
    element) in sublist. The elements of the sublist are lists
    themselves."""
    # TODO. This method is never used in the 300+ sentences in the tests
    # in testing/scripts/regression/evita/data-in. There is a use case for
    # this method though with phrases like "end up being eliminated",
    # where "up" should be added to the previous and not the current
    # list, which is now not dealt with properly. Also, the logic of this
    # method is a bit odd in that when the counter is 0 the item will be
    # added to the last (and current) list, maybe want a warning instead.
    if len(sublist) >= self.counter - 1:
        sublist[self.counter - 1].append(element)
    else:
        # not sure whether this can actually occur
        logger.warn("list should be longer")

def _update_element(self, element):
    """Uses the orphans in the TarsqiTree of the element to update
    chunks."""
    # NOTE: this is generic sounding, but is really only meant for timexes
    # TODO: maybe rename while the above is the case
    doctree = create_tarsqi_tree(self.tarsqidoc, element)
    for orphan in doctree.orphans:
        sentence = self._get_containing_sentence(doctree, orphan)
        if sentence is None:
            logger.warn("No sentence contains %s" % orphan)
            continue
        nodes = [n for n in sentence.all_nodes() if n.overlaps(orphan)]
        nodes = [n for n in nodes if n is not sentence and not n.isToken()]
        # self._debug(orphan, sentence, nodes)
        self._remove_overlapping_chunks(nodes)
        self._add_chunks_for_timexes(element)

def run_timex_linking(self):
    """Apply the rules that govern relations between TIMEX3 tags. Only
    applies to TIMEX3 tags with type=DATE."""
    # TODO: add a DCT TIMEX tag if it is not in the tags dictionary, but
    # maybe check first whether it is in the dictionary in case we care
    # about duplications (see https://github.com/tarsqi/ttk/issues/10 and
    # https://github.com/tarsqi/ttk/issues/13)
    timexes = self.tarsqidoc.tags.find_tags(TIMEX)
    timexes = [t for t in timexes if t.attrs[TYPE] == 'DATE']
    pairs = _timex_pairs(timexes)
    for timex1, timex2 in pairs:
        try:
            self._create_timex_link(timex1, timex2)
        except Exception:
            # TODO: these errors are very common, usually because one of
            # the timexes does not have a value; should look into this
            # and confirm it is always a GUTime issue.
            logger.warn("Error linking:\n%s\n%s" % (timex1, timex2))

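# A minimal sketch of what a pairing helper like _timex_pairs() above
# could do; whether TTK's version enumerates all pairs or uses some
# adjacency window is an assumption here, this just shows the
# all-unordered-pairs variant.
from itertools import combinations

def timex_pairs(timexes):
    """Return all unordered pairs of timexes as a list of 2-tuples."""
    return list(combinations(timexes, 2))

# timex_pairs(['t1', 't2', 't3'])
# => [('t1', 't2'), ('t1', 't3'), ('t2', 't3')]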