예제 #1
0
파일: consistent.py 프로젝트: ml-lab/cavat
    def checkDocument(self, doc_id):
        """Check the temporal links of one document for consistency.

        doc_id is concatenated directly into the SQL text, so it must be a
        string.

        Returns True when self.consistencyCheck accepts the graph, False
        when startup fails or the check fails, and None when one of the
        database queries fails.
        """
        docName = self.startup(doc_id)
        if not docName:
            return False

        # Algorithm outline, as described by the original author (presumably
        # carried out by self.consistencyCheck, which is not shown here):
        #   - find all intervals referenced by tlinks
        #   - split these into a start and end, and for each interval, add
        #     interval_start < interval_end to the database (which assumes
        #     we have only annotated proper intervals).
        #   - for each tlink, add axioms about the points that they link to
        #     the agenda
        #   - perform agenda based closure; before any agenda addition, check
        #     that the rule being added does not conflict with other rules.
        #     a conflict is when we assign a new value for a pair that is
        #     already present; e.g., a<b conflicts with b=a, or b<a, or a=b
        #   - if we ever find a conflict, then the graph is inconsistent.
        #   - if we empty the agenda, the graph is consistent.

        # gather every interval id used as either argument of a tlink
        intervals = set()
        for argQuery in ('SELECT DISTINCT arg1 FROM tlinks WHERE doc_id = ',
                         'SELECT DISTINCT arg2 FROM tlinks WHERE doc_id = '):
            if not runQuery(argQuery + doc_id):
                return
            # rows come back as one-column records; keep only the value
            intervals.update(row[0] for row in db.cursor.fetchall())

        # fetch tlinks
        tlinkQuery = ('SELECT arg1, reltype, arg2, lid FROM tlinks WHERE doc_id = ' +
                      doc_id)
        if not runQuery(tlinkQuery):
            return
        tlinks = db.cursor.fetchall()

        if not self.consistencyCheck(intervals, tlinks):
            # only print doc name if it's not already there
            if not cavatDebug.debug:
                print("# Checking " + docName + ' (id ' + doc_id + ')')

            print('! ' + self.failReason)
            return False

        if cavatDebug.debug:
            print('Consistent')

        if self.superVerbose:
            print(self.database)

        return True
예제 #2
0
파일: consistent.py 프로젝트: ml-lab/cavat
    def checkDocument(self,  doc_id):
        """Check the temporal links of one document for consistency.

        doc_id is concatenated directly into the SQL text, so it must be a
        string.

        Returns True when self.consistencyCheck accepts the graph, False
        when startup fails or the check fails, and None when one of the
        database queries fails.
        """
        docName = self.startup(doc_id)
        if not docName:
            return False

        # Algorithm outline, as described by the original author (presumably
        # carried out by self.consistencyCheck, which is not shown here):
        #   - find all intervals referenced by tlinks
        #   - split these into a start and end, and for each interval, add
        #     interval_start < interval_end to the database (which assumes
        #     we have only annotated proper intervals).
        #   - for each tlink, add axioms about the points that they link to
        #     the agenda
        #   - perform agenda based closure; before any agenda addition, check
        #     that the rule being added does not conflict with other rules.
        #     a conflict is when we assign a new value for a pair that is
        #     already present; e.g., a<b conflicts with b=a, or b<a, or a=b
        #   - if we ever find a conflict, then the graph is inconsistent.
        #   - if we empty the agenda, the graph is consistent.

        # gather every interval id used as either argument of a tlink
        intervals = set()
        for argQuery in ('SELECT DISTINCT arg1 FROM tlinks WHERE doc_id = ',
                         'SELECT DISTINCT arg2 FROM tlinks WHERE doc_id = '):
            if not runQuery(argQuery + doc_id):
                return
            # rows come back as one-column records; keep only the value
            intervals.update(row[0] for row in db.cursor.fetchall())

        # fetch tlinks
        tlinkQuery = 'SELECT arg1, reltype, arg2, lid FROM tlinks WHERE doc_id = ' + doc_id
        if not runQuery(tlinkQuery):
            return
        tlinks = db.cursor.fetchall()

        if not self.consistencyCheck(intervals,  tlinks):
            # only print doc name if it's not already there
            if not cavatDebug.debug:
                print("# Checking " + docName + ' (id ' + doc_id + ')')

            print('! ' + self.failReason)
            return False

        if cavatDebug.debug:
            print('Consistent')

        if self.superVerbose:
            print(self.database)

        return True
예제 #3
0
파일: tlink_loop.py 프로젝트: ml-lab/cavat
    def checkDocument(self, doc_id):
        """Report TLINKs in one document that appear to link an event to itself.

        Two queries are run: TLINKs whose arg1 equals arg2, and TLINKs whose
        two arguments are instances sharing the same eventID. The combined
        rows are de-duplicated with self.uniq and printed.

        doc_id is concatenated directly into the SQL text, so it must be a
        string.

        Returns True when no such TLINK is found, False when startup fails
        or at least one is reported, and None when a query fails.
        """
        docName = self.startup(doc_id)
        if not docName:
            return False

        # sort on the numeric part of the link id (everything after char 1)
        ordering = " ORDER BY CAST(SUBSTR(lid,2) AS SIGNED)"

        # direct loops: both arguments of the TLINK are the very same id
        directQuery = (
            "SELECT lid, relType, arg1, arg2 FROM tlinks WHERE arg1 = arg2 AND doc_id = "
            + doc_id
            + ordering
        )
        if not runQuery(directQuery):
            return
        directLoops = db.cursor.fetchall()

        # possible loops: the two arguments are instances of the same event
        sameEventQuery = (
            "SELECT lid, reltype, arg1, arg2 FROM tlinks AS t, instances AS i1, instances AS i2 WHERE t.arg1 = i1.eiid AND t.arg2 = i2.eiid AND t.doc_id = i1.doc_id AND t.doc_id = i2.doc_id AND i1.eventID = i2.eventID AND t.doc_id = "
            + doc_id
            + ordering
        )
        if not runQuery(sameEventQuery):
            return

        loopedTlinks = self.uniq(directLoops + db.cursor.fetchall())

        if not loopedTlinks:
            if cavatDebug.debug:
                print("No looping TLINKs found in this document.")

            return True

        print("# Checking " + docName + " (id " + doc_id + ")")

        for row in loopedTlinks:
            # identical arguments are a certain loop; otherwise the row came
            # from the shared-eventID query and needs a manual look
            if row[2] != row[3]:
                template = "TLINK ID %s may be a loop (eventID match), type %s, event %s / %s - check document manually"
            else:
                template = "TLINK ID %s loops directly (instanceID match), type %s, event %s / %s"
            print(template % (row[0], row[1], row[2], row[3]))

        return False
예제 #4
0
    def getTlinks(self, doc_id):
        """Fetch the (arg1, arg2) pairs of every tlink in a document.

        Returns whatever db.cursor.fetchall() yields for the query, or None
        when runQuery reports failure.
        """
        query = 'SELECT arg1, arg2 FROM tlinks WHERE doc_id = ' + str(doc_id)
        if runQuery(query):
            return db.cursor.fetchall()
        return None
예제 #5
0
    def checkDocument(self, doc_id):
        """Report TLINKs in one document that appear to link an event to itself.

        Two queries are run: TLINKs whose arg1 equals arg2, and TLINKs whose
        two arguments are instances sharing the same eventID. The combined
        rows are de-duplicated with self.uniq and printed.

        doc_id is concatenated directly into the SQL text, so it must be a
        string.

        Returns True when no such TLINK is found, False when startup fails
        or at least one is reported, and None when a query fails.
        """
        docName = self.startup(doc_id)
        if not docName:
            return False

        # sort on the numeric part of the link id (everything after char 1)
        ordering = ' ORDER BY CAST(SUBSTR(lid,2) AS SIGNED)'

        # direct loops: both arguments of the TLINK are the very same id
        directQuery = (
            'SELECT lid, relType, arg1, arg2 FROM tlinks WHERE arg1 = arg2 AND doc_id = '
            + doc_id + ordering)
        if not runQuery(directQuery):
            return
        directLoops = db.cursor.fetchall()

        # possible loops: the two arguments are instances of the same event
        sameEventQuery = (
            'SELECT lid, reltype, arg1, arg2 FROM tlinks AS t, instances AS i1, instances AS i2 WHERE t.arg1 = i1.eiid AND t.arg2 = i2.eiid AND t.doc_id = i1.doc_id AND t.doc_id = i2.doc_id AND i1.eventID = i2.eventID AND t.doc_id = '
            + doc_id + ordering)
        if not runQuery(sameEventQuery):
            return

        loopedTlinks = self.uniq(directLoops + db.cursor.fetchall())

        if not loopedTlinks:
            if cavatDebug.debug:
                print('No looping TLINKs found in this document.')

            return True

        print("# Checking " + docName + ' (id ' + doc_id + ')')

        for row in loopedTlinks:
            # identical arguments are a certain loop; otherwise the row came
            # from the shared-eventID query and needs a manual look
            if row[2] != row[3]:
                template = 'TLINK ID %s may be a loop (eventID match), type %s, event %s / %s - check document manually'
            else:
                template = 'TLINK ID %s loops directly (instanceID match), type %s, event %s / %s'
            print(template % (row[0], row[1], row[2], row[3]))

        return False
예제 #6
0
파일: cavatModule.py 프로젝트: ml-lab/cavat
 def startup(self,  doc_id):
     """Look up the name of a document and optionally announce the check.

     doc_id is concatenated directly into the SQL text and into the
     printed messages, so it must be a string.

     Returns the document name as a str, or False (after printing a
     message) when the query fails or no document row is found.
     """
     row = None
     if runQuery('SELECT docname FROM documents WHERE id = ' + doc_id):
         row = db.cursor.fetchone()

     # A query can succeed yet match nothing; assuming db.cursor follows
     # the DB-API, fetchone() then yields None and indexing it would raise
     # TypeError, so treat that case as "document not found" as well.
     if row is None:
         print('! No document in corpus with id ' + doc_id)
         return False

     docName = str(row[0])

     if cavatDebug.debug:
         print("# Checking " + docName + ' (id ' + doc_id + ')')

     return docName
예제 #7
0
파일: orphans.py 프로젝트: ml-lab/cavat
    def checkDocument(self, doc_id):
        """Report orphaned annotation elements in one document.

        Orphan cases that are checked:
          - timex3 not in a link
          - instance not in a link
          - event not got an instance
          - instance not got an event
          - signals not referenced by any tlink, slink, alink or instance
          - tlink referencing non-existent signal

        doc_id is concatenated directly into the SQL text, so it must be a
        string.

        Returns True when no orphan is found, False when startup fails or
        orphans are printed, and None when one of the queries fails.
        """
        docName = self.startup(doc_id)
        if not docName:
            return False

        def firstColumn(query):
            # Run one query and return the first column of every row;
            # None signals that runQuery reported failure.
            if not runQuery(query):
                return None
            return [row[0] for row in db.cursor.fetchall()]

        orphans = set()

        # build a set of intervals already used by tlinks, slinks or alinks
        linkedIntervals = set()
        queries = [
            'SELECT DISTINCT arg2 FROM tlinks WHERE doc_id = ',
            'SELECT DISTINCT arg1 FROM tlinks WHERE doc_id = ',
            'SELECT DISTINCT eventInstanceID FROM slinks WHERE doc_id = ',
            'SELECT DISTINCT subordinatedEventInstance FROM slinks WHERE doc_id = ',
            'SELECT DISTINCT eventInstanceID FROM alinks WHERE doc_id = ',
            'SELECT DISTINCT relatedToEventInstance FROM alinks WHERE doc_id = '
        ]
        for query in queries:
            linked = firstColumn(query + doc_id)
            if linked is None:
                return
            linkedIntervals.update(linked)

        # first is easiest - check for timex3s that aren't mentioned in a link
        timex3s = firstColumn('SELECT tid FROM timex3s WHERE doc_id = ' + doc_id)
        if timex3s is None:
            return
        for tid in timex3s:
            if tid not in linkedIntervals:
                orphans.add('TIMEX3 ' + str(tid) + ' not in any link')

        # next, check for instances not in any link
        eiids = firstColumn('SELECT eiid FROM instances WHERE doc_id = ' + doc_id)
        if eiids is None:
            return
        for missing in set(eiids).difference(linkedIntervals):
            orphans.add('INSTANCE ' + str(missing) + ' not in any link')

        # then, find events that don't have any instances
        eids = firstColumn('SELECT eid FROM events WHERE doc_id = ' + doc_id)
        if eids is None:
            return
        events = set(eids)

        eventIDs = firstColumn(
            'SELECT DISTINCT eventID FROM instances WHERE doc_id = ' + doc_id)
        if eventIDs is None:
            return
        instancedEvents = set(eventIDs)

        for missing in events.difference(instancedEvents):
            orphans.add('EVENT ' + str(missing) + ' is never instanced')

        # instances that don't reference an event, or reference an invalid event
        # (instances where eventID == '')  union  (instancedEvents [eventID] minus events [eid])
        for missing in instancedEvents.difference(events):
            # the label needs its trailing space, like every other message
            orphans.add('INSTANCE ' + str(missing) +
                        ' references absent eventID')

        unlinked = firstColumn(
            'SELECT eiid FROM instances WHERE (eventID = "" OR eventID IS NULL) AND doc_id = '
            + doc_id)
        if unlinked is None:
            return
        for eiid in unlinked:
            orphans.add('INSTANCE ' + str(eiid) +
                        ' does not reference an event')

        # signals not referenced by any tlink, alink, slink or instance
        unusedSignals = firstColumn(
            'SELECT sid FROM signals WHERE doc_id = %s AND sid NOT IN (SELECT signalID FROM tlinks WHERE doc_id = %s AND signalID IS NOT NULL) AND sid NOT IN (SELECT signalID FROM slinks WHERE doc_id = %s AND signalID IS NOT NULL) AND sid NOT IN (SELECT signalID FROM alinks WHERE doc_id = %s AND signalID IS NOT NULL) AND sid NOT IN (SELECT signalID FROM instances WHERE doc_id = %s AND signalID IS NOT NULL)'
            % ((doc_id,) * 5))
        if unusedSignals is None:
            return
        for sid in unusedSignals:
            orphans.add(
                'SIGNAL ' + str(sid) +
                ' is not referenced by any TLINK, SLINK, ALINK or MAKEINSTANCE'
            )

        # tlink referencing non-existent signal
        danglingLinks = firstColumn(
            'SELECT lid FROM tlinks WHERE signalID != "" AND signalID NOT IN (SELECT sid FROM signals WHERE doc_id = %s) AND doc_id = %s'
            % (doc_id, doc_id))
        if danglingLinks is None:
            return
        for lid in danglingLinks:
            orphans.add(
                'TLINK ' + str(lid) +
                ' references a signal that is not included in the annotation')

        if not orphans:
            return True

        print("# Checking " + docName + ' (id ' + doc_id + ')')

        # sorted output keeps the report stable between runs
        for orphan in sorted(orphans):
            print(orphan)

        return False
예제 #8
0
파일: split_graph.py 프로젝트: ml-lab/cavat
 def getTlinks(self, doc_id):
     """Fetch the (arg1, arg2) pairs of every tlink in a document.

     Returns whatever db.cursor.fetchall() yields for the query, or None
     when runQuery reports failure.
     """
     query = 'SELECT arg1, arg2 FROM tlinks WHERE doc_id = ' + str(doc_id)
     if runQuery(query):
         return db.cursor.fetchall()
     return None
예제 #9
0
파일: orphans.py 프로젝트: ml-lab/cavat
    def checkDocument(self,  doc_id):
        """Report orphaned annotation elements in one document.

        Orphan cases that are checked:
          - timex3 not in a link
          - instance not in a link
          - event not got an instance
          - instance not got an event
          - signals not referenced by any tlink, slink, alink or instance
          - tlink referencing non-existent signal

        doc_id is concatenated directly into the SQL text, so it must be a
        string.

        Returns True when no orphan is found, False when startup fails or
        orphans are printed, and None when one of the queries fails.
        """
        docName = self.startup(doc_id)
        if not docName:
            return False

        def firstColumn(query):
            # Run one query and return the first column of every row;
            # None signals that runQuery reported failure.
            if not runQuery(query):
                return None
            return [row[0] for row in db.cursor.fetchall()]

        orphans = set()

        # build a set of intervals already used by tlinks, slinks or alinks
        linkedIntervals = set()
        queries = [
            'SELECT DISTINCT arg2 FROM tlinks WHERE doc_id = ',
            'SELECT DISTINCT arg1 FROM tlinks WHERE doc_id = ',
            'SELECT DISTINCT eventInstanceID FROM slinks WHERE doc_id = ',
            'SELECT DISTINCT subordinatedEventInstance FROM slinks WHERE doc_id = ',
            'SELECT DISTINCT eventInstanceID FROM alinks WHERE doc_id = ',
            'SELECT DISTINCT relatedToEventInstance FROM alinks WHERE doc_id = ',
        ]
        for query in queries:
            linked = firstColumn(query + doc_id)
            if linked is None:
                return
            linkedIntervals.update(linked)

        # first is easiest - check for timex3s that aren't mentioned in a link
        timex3s = firstColumn('SELECT tid FROM timex3s WHERE doc_id = ' + doc_id)
        if timex3s is None:
            return
        for tid in timex3s:
            if tid not in linkedIntervals:
                orphans.add('TIMEX3 ' + str(tid) + ' not in any link')

        # next, check for instances not in any link
        eiids = firstColumn('SELECT eiid FROM instances WHERE doc_id = ' + doc_id)
        if eiids is None:
            return
        for missing in set(eiids).difference(linkedIntervals):
            orphans.add('INSTANCE ' + str(missing) + ' not in any link')

        # then, find events that don't have any instances
        eids = firstColumn('SELECT eid FROM events WHERE doc_id = ' + doc_id)
        if eids is None:
            return
        events = set(eids)

        eventIDs = firstColumn('SELECT DISTINCT eventID FROM instances WHERE doc_id = ' + doc_id)
        if eventIDs is None:
            return
        instancedEvents = set(eventIDs)

        for missing in events.difference(instancedEvents):
            orphans.add('EVENT ' + str(missing) + ' is never instanced')

        # instances that don't reference an event, or reference an invalid event
        # (instances where eventID == '')  union  (instancedEvents [eventID] minus events [eid])
        for missing in instancedEvents.difference(events):
            # the label needs its trailing space, like every other message
            orphans.add('INSTANCE ' + str(missing) + ' references absent eventID')

        unlinked = firstColumn('SELECT eiid FROM instances WHERE (eventID = "" OR eventID IS NULL) AND doc_id = ' + doc_id)
        if unlinked is None:
            return
        for eiid in unlinked:
            orphans.add('INSTANCE ' + str(eiid) + ' does not reference an event')

        # signals not referenced by any tlink, alink, slink or instance
        unusedSignals = firstColumn('SELECT sid FROM signals WHERE doc_id = %s AND sid NOT IN (SELECT signalID FROM tlinks WHERE doc_id = %s AND signalID IS NOT NULL) AND sid NOT IN (SELECT signalID FROM slinks WHERE doc_id = %s AND signalID IS NOT NULL) AND sid NOT IN (SELECT signalID FROM alinks WHERE doc_id = %s AND signalID IS NOT NULL) AND sid NOT IN (SELECT signalID FROM instances WHERE doc_id = %s AND signalID IS NOT NULL)' % ((doc_id,) * 5))
        if unusedSignals is None:
            return
        for sid in unusedSignals:
            orphans.add('SIGNAL ' + str(sid) + ' is not referenced by any TLINK, SLINK, ALINK or MAKEINSTANCE')

        # tlink referencing non-existent signal
        danglingLinks = firstColumn('SELECT lid FROM tlinks WHERE signalID != "" AND signalID NOT IN (SELECT sid FROM signals WHERE doc_id = %s) AND doc_id = %s' % (doc_id, doc_id))
        if danglingLinks is None:
            return
        for lid in danglingLinks:
            orphans.add('TLINK ' + str(lid) + ' references a signal that is not included in the annotation')

        if not orphans:
            return True

        print("# Checking " + docName + ' (id ' + doc_id + ')')

        # sorted output keeps the report stable between runs
        for orphan in sorted(orphans):
            print(orphan)

        return False
예제 #10
0
                    sqlWheres.append(conditionField +' IS NULL')
                    whereCaption = ' when ' + conditionFieldName + ' is not defined'

        # process report type
        # a list report just shows the values as they are, without any accompanying data
        if t.report == 'list':
            pass
            
        elif t.report == 'distribution':
            # build a distribution report. here we will show unique values for a field, as well as their frequency in the selected corpus, showing most frequent first.
            # would be great to add a percentage column
            sqlGroup = ' GROUP BY ' + sqlFieldName
            sqlCount = 'COUNT(' + sqlFieldName + ') AS count '
            
            # run a quick pre-query to see the total number of results returned
            if not db.runQuery('SELECT '+ sqlCount + ' FROM ' + sqlTable + ' ' + buildSqlWhereClause(sqlWheres)):
                continue
            
            totalRecords = db.cursor.fetchone()[0]
            # add the .0 after totalrecords so that float division is performed
            sqlField = sqlFieldName + ', ' + sqlCount + ', (COUNT('+sqlFieldName+')/'+str(totalRecords)+'.0) AS percent'

            # if we are generating a report about a numeric value, sort the table by that value, not by frequency; this way round, it's easier to spot lumps / import into a histogram
            if (sqlTable + '.' + sqlFieldName).lower() in numericFields:
                sqlOrder = ' ORDER BY ' + sqlFieldName + ' ASC'
            else:
                sqlOrder = ' ORDER BY count DESC'
            
        # state is either "filled" or "unfilled", showing whether or not the attributed has been specified
        elif t.report == 'state':
            
예제 #11
0
파일: cavat.py 프로젝트: ml-lab/cavat
                    sqlWheres.append(conditionField + " IS NULL")
                    whereCaption = " when " + conditionFieldName + " is not defined"

        # process report type
        # a list report just shows the values as they are, without any accompanying data
        if t.report == "list":
            pass

        elif t.report == "distribution":
            # build a distribution report. here we will show unique values for a field, as well as their frequency in the selected corpus, showing most frequent first.
            # would be great to add a percentage column
            sqlGroup = " GROUP BY " + sqlFieldName
            sqlCount = "COUNT(" + sqlFieldName + ") AS count "

            # run a quick pre-query to see the total number of results returned
            if not db.runQuery("SELECT " + sqlCount + " FROM " + sqlTable + " " + buildSqlWhereClause(sqlWheres)):
                continue

            totalRecords = db.cursor.fetchone()[0]
            # add the .0 after totalrecords so that float division is performed
            sqlField = (
                sqlFieldName
                + ", "
                + sqlCount
                + ", (COUNT("
                + sqlFieldName
                + ")/"
                + str(totalRecords)
                + ".0) AS percent"
            )