def checkDocument(self, doc_id):
    """Report whether the TLINK graph of document `doc_id` is consistent.

    The actual test is delegated to self.consistencyCheck (not shown here).
    The original author's notes describe it as an agenda-based closure:
      - find all intervals referenced by TLINKs;
      - split each into a start and an end point and record start < end
        (which assumes only proper intervals were annotated);
      - for each TLINK, put axioms about the points it links on the agenda;
      - run the closure, checking before every addition that the new rule
        does not contradict a value already held for that pair
        (e.g. a<b conflicts with b=a, b<a or a=b);
      - a conflict means the graph is inconsistent; an emptied agenda means
        it is consistent.

    doc_id is concatenated into SQL and messages, so it must be a string.

    Returns True when consistent, False when startup fails or the check
    fails, and None when one of the database queries fails.
    """
    docName = self.startup(doc_id)
    if not docName:
        return False

    # Gather the distinct endpoints used on either side of any TLINK.
    endpointQueries = (
        'SELECT DISTINCT arg1 FROM tlinks WHERE doc_id = ',
        'SELECT DISTINCT arg2 FROM tlinks WHERE doc_id = ',
    )
    endpointRows = set()
    for queryPrefix in endpointQueries:
        if not runQuery(queryPrefix + doc_id):
            return
        endpointRows.update(db.cursor.fetchall())

    # Rows arrive as 1-tuples; keep only the interval identifier itself.
    intervals = set(row[0] for row in endpointRows)

    # Fetch the links themselves.
    if not runQuery('SELECT arg1, reltype, arg2, lid FROM tlinks WHERE doc_id = ' + doc_id):
        return
    tlinks = db.cursor.fetchall()

    if not self.consistencyCheck(intervals, tlinks):
        # startup() already printed the header when debugging is on.
        if not cavatDebug.debug:
            print("# Checking " + docName + ' (id ' + doc_id + ')')
        print('! ' + self.failReason)
        return False

    if cavatDebug.debug:
        print('Consistent')
    # NOTE(review): superVerbose is treated as independent of the debug flag;
    # confirm against the original layout.
    if self.superVerbose:
        print(self.database)
    return True
def checkDocument(self, doc_id):
    """Report whether the TLINK graph of document `doc_id` is consistent.

    The actual test is delegated to self.consistencyCheck (not shown here).
    The original author's notes describe it as an agenda-based closure:
      - find all intervals referenced by TLINKs;
      - split each into a start and an end point and record start < end
        (which assumes only proper intervals were annotated);
      - for each TLINK, put axioms about the points it links on the agenda;
      - run the closure, checking before every addition that the new rule
        does not contradict a value already held for that pair
        (e.g. a<b conflicts with b=a, b<a or a=b);
      - a conflict means the graph is inconsistent; an emptied agenda means
        it is consistent.

    doc_id is concatenated into SQL and messages, so it must be a string.

    Returns True when consistent, False when startup fails or the check
    fails, and None when one of the database queries fails.
    """
    docName = self.startup(doc_id)
    if not docName:
        return False

    # Gather the distinct endpoints used on either side of any TLINK.
    endpointQueries = (
        'SELECT DISTINCT arg1 FROM tlinks WHERE doc_id = ',
        'SELECT DISTINCT arg2 FROM tlinks WHERE doc_id = ',
    )
    endpointRows = set()
    for queryPrefix in endpointQueries:
        if not runQuery(queryPrefix + doc_id):
            return
        endpointRows.update(db.cursor.fetchall())

    # Rows arrive as 1-tuples; keep only the interval identifier itself.
    intervals = set(row[0] for row in endpointRows)

    # Fetch the links themselves.
    if not runQuery('SELECT arg1, reltype, arg2, lid FROM tlinks WHERE doc_id = ' + doc_id):
        return
    tlinks = db.cursor.fetchall()

    if not self.consistencyCheck(intervals, tlinks):
        # startup() already printed the header when debugging is on.
        if not cavatDebug.debug:
            print("# Checking " + docName + ' (id ' + doc_id + ')')
        print('! ' + self.failReason)
        return False

    if cavatDebug.debug:
        print('Consistent')
    # NOTE(review): superVerbose is treated as independent of the debug flag;
    # confirm against the original layout.
    if self.superVerbose:
        print(self.database)
    return True
def checkDocument(self, doc_id):
    """Report TLINKs in document `doc_id` that link something to itself.

    Two kinds of loop are looked for:
      - direct loops, where arg1 and arg2 of the TLINK are identical;
      - possible loops, where arg1 and arg2 are instances sharing an eventID.

    doc_id is concatenated into SQL and messages, so it must be a string.

    Returns True when no such TLINK exists, False when startup fails or any
    loop is reported, and None when a database query fails.
    """
    docName = self.startup(doc_id)
    if not docName:
        return False

    lidOrder = " ORDER BY CAST(SUBSTR(lid,2) AS SIGNED)"

    # TLINKs whose two arguments are literally the same identifier.
    directQuery = (
        "SELECT lid, relType, arg1, arg2 FROM tlinks"
        " WHERE arg1 = arg2 AND doc_id = " + doc_id + lidOrder
    )
    if not runQuery(directQuery):
        return
    directLoops = db.cursor.fetchall()

    # TLINKs whose two arguments are instances of one and the same event.
    sameEventQuery = (
        "SELECT lid, reltype, arg1, arg2"
        " FROM tlinks AS t, instances AS i1, instances AS i2"
        " WHERE t.arg1 = i1.eiid AND t.arg2 = i2.eiid"
        " AND t.doc_id = i1.doc_id AND t.doc_id = i2.doc_id"
        " AND i1.eventID = i2.eventID AND t.doc_id = " + doc_id + lidOrder
    )
    if not runQuery(sameEventQuery):
        return
    loopedTlinks = self.uniq(directLoops + db.cursor.fetchall())

    if not loopedTlinks:
        if cavatDebug.debug:
            print("No looping TLINKs found in this document.")
        return True

    print("# Checking " + docName + " (id " + doc_id + ")")
    for row in loopedTlinks:
        fields = (row[0], row[1], row[2], row[3])
        if row[2] == row[3]:
            template = "TLINK ID %s loops directly (instanceID match), type %s, event %s / %s"
        else:
            template = "TLINK ID %s may be a loop (eventID match), type %s, event %s / %s - check document manually"
        print(template % fields)
    return False
def getTlinks(self, doc_id):
    """Fetch the (arg1, arg2) pair of every TLINK in document `doc_id`.

    Returns whatever db.cursor.fetchall() yields for the query, or None when
    runQuery reports failure.
    """
    query = 'SELECT arg1, arg2 FROM tlinks WHERE doc_id = ' + str(doc_id)
    if runQuery(query):
        return db.cursor.fetchall()
    return None
def checkDocument(self, doc_id):
    """Report TLINKs in document `doc_id` that link something to itself.

    Two kinds of loop are looked for:
      - direct loops, where arg1 and arg2 of the TLINK are identical;
      - possible loops, where arg1 and arg2 are instances sharing an eventID.

    doc_id is concatenated into SQL and messages, so it must be a string.

    Returns True when no such TLINK exists, False when startup fails or any
    loop is reported, and None when a database query fails.
    """
    docName = self.startup(doc_id)
    if not docName:
        return False

    lidOrder = ' ORDER BY CAST(SUBSTR(lid,2) AS SIGNED)'

    # TLINKs whose two arguments are literally the same identifier.
    directQuery = (
        'SELECT lid, relType, arg1, arg2 FROM tlinks'
        ' WHERE arg1 = arg2 AND doc_id = ' + doc_id + lidOrder
    )
    if not runQuery(directQuery):
        return
    directLoops = db.cursor.fetchall()

    # TLINKs whose two arguments are instances of one and the same event.
    sameEventQuery = (
        'SELECT lid, reltype, arg1, arg2'
        ' FROM tlinks AS t, instances AS i1, instances AS i2'
        ' WHERE t.arg1 = i1.eiid AND t.arg2 = i2.eiid'
        ' AND t.doc_id = i1.doc_id AND t.doc_id = i2.doc_id'
        ' AND i1.eventID = i2.eventID AND t.doc_id = ' + doc_id + lidOrder
    )
    if not runQuery(sameEventQuery):
        return
    loopedTlinks = self.uniq(directLoops + db.cursor.fetchall())

    if not loopedTlinks:
        if cavatDebug.debug:
            print('No looping TLINKs found in this document.')
        return True

    print("# Checking " + docName + ' (id ' + doc_id + ')')
    for row in loopedTlinks:
        fields = (row[0], row[1], row[2], row[3])
        if row[2] == row[3]:
            template = 'TLINK ID %s loops directly (instanceID match), type %s, event %s / %s'
        else:
            template = 'TLINK ID %s may be a loop (eventID match), type %s, event %s / %s - check document manually'
        print(template % fields)
    return False
def startup(self, doc_id):
    """Look up the name of document `doc_id` and announce the check.

    doc_id is concatenated into the SQL text and the messages, so it must be
    a string.

    Returns the document name as a str, or False (after printing a
    '! No document ...' message) when the document cannot be found.
    """
    row = None
    if runQuery('SELECT docname FROM documents WHERE id = ' + doc_id):
        # fetchone() yields None when the query ran but matched no row.
        # runQuery's contract is not visible here, so guard that case rather
        # than let row[0] raise a TypeError for an unknown id.
        row = db.cursor.fetchone()

    if row is None:
        # document not found
        print('! No document in corpus with id ' + doc_id)
        return False

    docName = str(row[0])

    if cavatDebug.debug:
        print("# Checking " + docName + ' (id ' + doc_id + ')')

    return docName
def checkDocument(self, doc_id):
    """Report annotation elements of document `doc_id` that are orphaned.

    Orphan cases checked:
      - TIMEX3 not in any link;
      - instance not in any link;
      - event that has no instance;
      - instance whose eventID is absent, empty or NULL;
      - signal not referenced by any TLINK, SLINK, ALINK or instance;
      - TLINK referencing a signal that does not exist.

    doc_id is concatenated into SQL and messages, so it must be a string.

    Returns True when nothing is orphaned, False when startup fails or at
    least one orphan is printed, and None when a database query fails.
    """
    docName = self.startup(doc_id)
    if not docName:
        return False

    orphans = set()

    # Every interval that already takes part in a TLINK, SLINK or ALINK.
    linkQueries = [
        'SELECT DISTINCT arg2 FROM tlinks WHERE doc_id = ',
        'SELECT DISTINCT arg1 FROM tlinks WHERE doc_id = ',
        'SELECT DISTINCT eventInstanceID FROM slinks WHERE doc_id = ',
        'SELECT DISTINCT subordinatedEventInstance FROM slinks WHERE doc_id = ',
        'SELECT DISTINCT eventInstanceID FROM alinks WHERE doc_id = ',
        'SELECT DISTINCT relatedToEventInstance FROM alinks WHERE doc_id = ',
    ]
    linkedIntervals = set()
    for queryPrefix in linkQueries:
        if not runQuery(queryPrefix + doc_id):
            return
        linkedIntervals.update(row[0] for row in db.cursor.fetchall())

    # TIMEX3s that no link mentions.
    if not runQuery('SELECT tid FROM timex3s WHERE doc_id = ' + doc_id):
        return
    for row in db.cursor.fetchall():
        if row[0] not in linkedIntervals:
            orphans.add('TIMEX3 ' + str(row[0]) + ' not in any link')

    # Instances that no link mentions.
    if not runQuery('SELECT eiid FROM instances WHERE doc_id = ' + doc_id):
        return
    instances = set(row[0] for row in db.cursor.fetchall())
    for missing in instances - linkedIntervals:
        orphans.add('INSTANCE ' + str(missing) + ' not in any link')

    # Events that never get an instance.
    if not runQuery('SELECT eid FROM events WHERE doc_id = ' + doc_id):
        return
    events = set(row[0] for row in db.cursor.fetchall())

    if not runQuery('SELECT DISTINCT eventID FROM instances WHERE doc_id = ' + doc_id):
        return
    instancedEvents = set(row[0] for row in db.cursor.fetchall())

    for missing in events - instancedEvents:
        orphans.add('EVENT ' + str(missing) + ' is never instanced')

    # Instances pointing at an eventID with no matching event. `missing` is
    # the eventID value. The separating space was absent in the original
    # message, which glued the identifier onto the word INSTANCE.
    for missing in instancedEvents - events:
        orphans.add('INSTANCE ' + str(missing) + ' references absent eventID')

    # Instances with an empty or NULL eventID.
    if not runQuery('SELECT eiid FROM instances WHERE (eventID = "" OR eventID IS NULL) AND doc_id = ' + doc_id):
        return
    for row in db.cursor.fetchall():
        orphans.add('INSTANCE ' + str(row[0]) + ' does not reference an event')

    # Signals not referenced by any tlink, slink, alink or instance.
    signalQuery = (
        'SELECT sid FROM signals WHERE doc_id = %s'
        ' AND sid NOT IN (SELECT signalID FROM tlinks WHERE doc_id = %s AND signalID IS NOT NULL)'
        ' AND sid NOT IN (SELECT signalID FROM slinks WHERE doc_id = %s AND signalID IS NOT NULL)'
        ' AND sid NOT IN (SELECT signalID FROM alinks WHERE doc_id = %s AND signalID IS NOT NULL)'
        ' AND sid NOT IN (SELECT signalID FROM instances WHERE doc_id = %s AND signalID IS NOT NULL)'
    ) % (doc_id, doc_id, doc_id, doc_id, doc_id)
    if not runQuery(signalQuery):
        return
    for row in db.cursor.fetchall():
        orphans.add('SIGNAL ' + str(row[0]) + ' is not referenced by any TLINK, SLINK, ALINK or MAKEINSTANCE')

    # TLINKs naming a signal that is not in the annotation.
    danglingSignalQuery = (
        'SELECT lid FROM tlinks WHERE signalID != ""'
        ' AND signalID NOT IN (SELECT sid FROM signals WHERE doc_id = %s)'
        ' AND doc_id = %s'
    ) % (doc_id, doc_id)
    if not runQuery(danglingSignalQuery):
        return
    for row in db.cursor.fetchall():
        orphans.add('TLINK ' + str(row[0]) + ' references a signal that is not included in the annotation')

    if not orphans:
        return True

    print("# Checking " + docName + ' (id ' + doc_id + ')')
    for orphan in sorted(orphans):
        print(orphan)
    return False
def checkDocument(self, doc_id):
    """Report annotation elements of document `doc_id` that are orphaned.

    Orphan cases checked:
      - TIMEX3 not in any link;
      - instance not in any link;
      - event that has no instance;
      - instance whose eventID is absent, empty or NULL;
      - signal not referenced by any TLINK, SLINK, ALINK or instance;
      - TLINK referencing a signal that does not exist.

    doc_id is concatenated into SQL and messages, so it must be a string.

    Returns True when nothing is orphaned, False when startup fails or at
    least one orphan is printed, and None when a database query fails.
    """
    docName = self.startup(doc_id)
    if not docName:
        return False

    orphans = set()

    # Every interval that already takes part in a TLINK, SLINK or ALINK.
    linkQueries = [
        'SELECT DISTINCT arg2 FROM tlinks WHERE doc_id = ',
        'SELECT DISTINCT arg1 FROM tlinks WHERE doc_id = ',
        'SELECT DISTINCT eventInstanceID FROM slinks WHERE doc_id = ',
        'SELECT DISTINCT subordinatedEventInstance FROM slinks WHERE doc_id = ',
        'SELECT DISTINCT eventInstanceID FROM alinks WHERE doc_id = ',
        'SELECT DISTINCT relatedToEventInstance FROM alinks WHERE doc_id = ',
    ]
    linkedIntervals = set()
    for queryPrefix in linkQueries:
        if not runQuery(queryPrefix + doc_id):
            return
        linkedIntervals.update(row[0] for row in db.cursor.fetchall())

    # TIMEX3s that no link mentions.
    if not runQuery('SELECT tid FROM timex3s WHERE doc_id = ' + doc_id):
        return
    for row in db.cursor.fetchall():
        if row[0] not in linkedIntervals:
            orphans.add('TIMEX3 ' + str(row[0]) + ' not in any link')

    # Instances that no link mentions.
    if not runQuery('SELECT eiid FROM instances WHERE doc_id = ' + doc_id):
        return
    instances = set(row[0] for row in db.cursor.fetchall())
    for missing in instances - linkedIntervals:
        orphans.add('INSTANCE ' + str(missing) + ' not in any link')

    # Events that never get an instance.
    if not runQuery('SELECT eid FROM events WHERE doc_id = ' + doc_id):
        return
    events = set(row[0] for row in db.cursor.fetchall())

    if not runQuery('SELECT DISTINCT eventID FROM instances WHERE doc_id = ' + doc_id):
        return
    instancedEvents = set(row[0] for row in db.cursor.fetchall())

    for missing in events - instancedEvents:
        orphans.add('EVENT ' + str(missing) + ' is never instanced')

    # Instances pointing at an eventID with no matching event. `missing` is
    # the eventID value. The separating space was absent in the original
    # message, which glued the identifier onto the word INSTANCE.
    for missing in instancedEvents - events:
        orphans.add('INSTANCE ' + str(missing) + ' references absent eventID')

    # Instances with an empty or NULL eventID.
    if not runQuery('SELECT eiid FROM instances WHERE (eventID = "" OR eventID IS NULL) AND doc_id = ' + doc_id):
        return
    for row in db.cursor.fetchall():
        orphans.add('INSTANCE ' + str(row[0]) + ' does not reference an event')

    # Signals not referenced by any tlink, slink, alink or instance.
    signalQuery = (
        'SELECT sid FROM signals WHERE doc_id = %s'
        ' AND sid NOT IN (SELECT signalID FROM tlinks WHERE doc_id = %s AND signalID IS NOT NULL)'
        ' AND sid NOT IN (SELECT signalID FROM slinks WHERE doc_id = %s AND signalID IS NOT NULL)'
        ' AND sid NOT IN (SELECT signalID FROM alinks WHERE doc_id = %s AND signalID IS NOT NULL)'
        ' AND sid NOT IN (SELECT signalID FROM instances WHERE doc_id = %s AND signalID IS NOT NULL)'
    ) % (doc_id, doc_id, doc_id, doc_id, doc_id)
    if not runQuery(signalQuery):
        return
    for row in db.cursor.fetchall():
        orphans.add('SIGNAL ' + str(row[0]) + ' is not referenced by any TLINK, SLINK, ALINK or MAKEINSTANCE')

    # TLINKs naming a signal that is not in the annotation.
    danglingSignalQuery = (
        'SELECT lid FROM tlinks WHERE signalID != ""'
        ' AND signalID NOT IN (SELECT sid FROM signals WHERE doc_id = %s)'
        ' AND doc_id = %s'
    ) % (doc_id, doc_id)
    if not runQuery(danglingSignalQuery):
        return
    for row in db.cursor.fetchall():
        orphans.add('TLINK ' + str(row[0]) + ' references a signal that is not included in the annotation')

    if not orphans:
        return True

    print("# Checking " + docName + ' (id ' + doc_id + ')')
    for orphan in sorted(orphans):
        print(orphan)
    return False
sqlWheres.append(conditionField +' IS NULL') whereCaption = ' when ' + conditionFieldName + ' is not defined' # process report type # a list report just shows the values as they are, without any accompanying data if t.report == 'list': pass elif t.report == 'distribution': # build a distribution report. here we will show unique values for a field, as well as their frequency in the selected corpus, showing most frequent first. # would be great to add a percentage column sqlGroup = ' GROUP BY ' + sqlFieldName sqlCount = 'COUNT(' + sqlFieldName + ') AS count ' # run a quick pre-query to see the total number of results returned if not db.runQuery('SELECT '+ sqlCount + ' FROM ' + sqlTable + ' ' + buildSqlWhereClause(sqlWheres)): continue totalRecords = db.cursor.fetchone()[0] # add the .0 after totalrecords so that float division is performed sqlField = sqlFieldName + ', ' + sqlCount + ', (COUNT('+sqlFieldName+')/'+str(totalRecords)+'.0) AS percent' # if we are generating a report about a numeric value, sort the table by that value, not by frequency; this way round, it's easier to spot lumps / import into a histogram if (sqlTable + '.' + sqlFieldName).lower() in numericFields: sqlOrder = ' ORDER BY ' + sqlFieldName + ' ASC' else: sqlOrder = ' ORDER BY count DESC' # state is either "filled" or "unfilled", showing whether or not the attributed has been specified elif t.report == 'state':
sqlWheres.append(conditionField + " IS NULL") whereCaption = " when " + conditionFieldName + " is not defined" # process report type # a list report just shows the values as they are, without any accompanying data if t.report == "list": pass elif t.report == "distribution": # build a distribution report. here we will show unique values for a field, as well as their frequency in the selected corpus, showing most frequent first. # would be great to add a percentage column sqlGroup = " GROUP BY " + sqlFieldName sqlCount = "COUNT(" + sqlFieldName + ") AS count " # run a quick pre-query to see the total number of results returned if not db.runQuery("SELECT " + sqlCount + " FROM " + sqlTable + " " + buildSqlWhereClause(sqlWheres)): continue totalRecords = db.cursor.fetchone()[0] # add the .0 after totalrecords so that float division is performed sqlField = ( sqlFieldName + ", " + sqlCount + ", (COUNT(" + sqlFieldName + ")/" + str(totalRecords) + ".0) AS percent" )