def _largeSmallSplit(self, pids: list, index: int, large: Application, small: Application, inserting: Application):
    """Cut `large` in two around `small` and lay the three pieces out in order.

    `large.split()` is asked for a piece ending one time unit before `small`
    starts and a piece starting one time unit after `small` ends. The list
    is then edited in place so that it reads ``head, small, tail`` around
    position `index`.

    Args:
        pids: list of Applications being edited (mutated in place).
        index: position in `pids` where the overlap was found.
        large: the Application whose runtime encloses the other one.
        small: the enclosed Application.
        inserting: whichever of `large` / `small` is the new Application.

    Returns:
        The same `pids` list, after modification.
    """
    cutBefore = small.getTimeOfStart() - 1
    cutAfter = small.getTimeOfEnd() + 1
    head, tail = large.split(beforeEnd=cutBefore, afterStart=cutAfter)

    if inserting == large:
        # The entry already sitting at `index` is left untouched
        # (presumably it is `small` -- verify against caller); the two
        # halves of the new Application are wrapped around it.
        pids.insert(index + 1, tail)
        pids.insert(index, head)
        return pids

    # Otherwise the entry at `index` is overwritten by the first half, and
    # `small` followed by the second half are slotted in right after it.
    pids[index] = head
    pids.insert(index + 1, tail)
    pids.insert(index + 1, small)
    return pids
def loadDb(self, store: ApplicationStore = None):
    """Browse the SQLite db and create all the relevant app instances.

    Rows of ``event_view`` belonging to events that have at least one
    ``activity://`` subject are fetched and merged per event id. Merged
    events are then grouped per PID, sorted by timestamp, and turned into
    successive Application instances: a new instance is started every time
    the Desktop id derived from the actor URI changes along a PID's
    timeline.

    Args:
        store: optional ApplicationStore. When given, every Application
            found that is not a study app is inserted into it, and study
            apps are discounted from the instance count.

    Side effects:
        Sets ``self.cur``, ``self.appCount``, ``self.instCount``,
        ``self.eventCount`` and ``self.validEventRatio`` (0.0 when no
        event was found), and prints a summary to stdout.

    Raises:
        AssertionError: if two rows of the same event carry different PIDs.
    """
    # Load up our events from the Zeitgeist database
    self.cur = self.con.cursor()
    self.cur.execute('SELECT * '
                     'FROM event_view '
                     'WHERE id IN (SELECT DISTINCT id '
                     'FROM event_view '
                     'WHERE subj_uri LIKE "activity://%")')

    # Compiled once, as a raw string so that \d is not treated as a
    # (deprecated) string escape sequence.
    pidPattern = re.compile(r'(?<=pid://)\d+')

    # Merge all event subjects based on their event id, and find their pids
    eventsMerged = dict()
    for data in iter(self.cur.fetchone, None):
        subjUri = data[EV_SUBJ_URI]
        pid = 0
        if "pid://" in subjUri:
            m = pidPattern.search(subjUri)
            pid = int(m.group(0)) if m else 0

        ev = eventsMerged.get(data[EV_ID])
        if not ev:
            ev = SqlEvent(id=data[EV_ID],
                          pid=pid,
                          timestamp=data[EV_TIMESTAMP],
                          interpretation=self.getInterpretation(
                              data[EV_INTERPRETATION]),
                          manifestation=self.getManifestation(
                              data[EV_MANIFESTATION]),
                          origin_uri=data[EV_EVENT_ORIGIN_URI],
                          actor_uri=data[EV_ACTOR_URI])
        elif pid and ev.pid:
            assert ev.pid == pid, ("Error: multiple events record a pid "
                                   " event %d, and they disagree on the "
                                   "pid to record (%d != %d)." % (
                                    data[EV_ID], ev.pid, pid))
        elif pid and not ev.pid:
            # An earlier row of this event had no PID; adopt this one.
            ev.pid = pid

        subj = SqlEventSubject(uri=subjUri,
                               interpretation=self.getInterpretation(
                                   data[EV_SUBJ_INTERPRETATION]),
                               manifestation=self.getManifestation(
                                   data[EV_SUBJ_MANIFESTATION]),
                               origin_uri=data[EV_SUBJ_ORIGIN_URI],
                               mimetype=self.getMimeType(
                                   data[EV_SUBJ_MIMETYPE]),
                               text=data[EV_SUBJ_TEXT],
                               storage_uri=data[EV_SUBJ_STORAGE],
                               current_uri=data[EV_SUBJ_CURRENT_URI])
        ev.addSubject(subj)
        eventsMerged[data[EV_ID]] = ev

    # Now, sort the events per app PID so we can build apps
    nopids = []             # Matching events without a PID
    eventsPerPid = dict()   # Storage for our events
    count = len(eventsMerged)  # Counter of fetched events, for stats
    instanceCount = 0       # Count of distinct app instances in the dataset
    actors = set()          # Desktop ids of every Application created

    for mergedEvent in eventsMerged.values():
        if not mergedEvent.pid:
            nopids.append(mergedEvent)
        else:
            eventsPerPid.setdefault(mergedEvent.pid, []).append(mergedEvent)
    del eventsMerged  # no longer needed

    # For each PID, we'll now identify the successive Application instances
    for (pkey, pevent) in eventsPerPid.items():
        pevent = sorted(pevent, key=lambda x: x.timestamp)
        currentId = ''     # currently matched Desktop Id
        currentApp = None  # currently matched Application
        apps = []          # temp storage for found Applications

        for ev in pevent:
            (evId, __) = Application.getDesktopIdFromDesktopUri(
                ev.actor_uri)

            if evId != currentId:
                # The actor changed: a new Application instance starts here.
                if debugEnabled():
                    print ("New application:", evId, currentId, ev)
                currentId = evId
                currentApp = Application(desktopid=evId,
                                         pid=int(pkey),
                                         tstart=ev.timestamp,
                                         tend=ev.timestamp)
                actors.add(currentApp.desktopid)
                apps.append(currentApp)
            else:
                # Same actor: stretch the current instance over this event.
                currentApp.setTimeOfStart(min(ev.timestamp,
                                              currentApp.getTimeOfStart()))
                currentApp.setTimeOfEnd(max(ev.timestamp,
                                            currentApp.getTimeOfEnd()))

            # Ignore study artefacts!
            if not currentApp.isStudyApp():
                event = Event(actor=currentApp,
                              time=ev.timestamp,
                              zgEvent=ev)
                currentApp.addEvent(event)

        # Insert into the ApplicationStore if one was given to us
        instanceCount += len(apps)
        if store is not None:
            for app in apps:
                # Ignore study artefacts!
                if not app.isStudyApp():
                    store.insert(app)
                else:
                    instanceCount -= 1  # We discount this app instance

    self.appCount = len(actors)
    self.instCount = instanceCount
    self.eventCount = count
    # With no matching event there is no ratio to compute: report 0% rather
    # than crash on a division by zero.
    self.validEventRatio = (100 - 100 * len(nopids) / count) if count else 0.0

    print("Finished loading DB.\n%d events seen, %d normal, %d without a "
          "PID.\nIn total, %.02f%% events accepted." % (
           count,
           count-len(nopids),
           len(nopids),
           self.validEventRatio))
    print("Instance count: %d" % self.instCount)
def insert(self, app: Application):
    """Insert an Application in the store.

    The Application is placed in the time-ordered list kept for its PID.
    If it overlaps an entry with the same Desktop id, it is merged into
    that entry. If it overlaps an entry with a different Desktop id, a
    warning is printed on stderr and the two are split via
    `dispatchSplit`, after which the whole list is re-merged.

    Args:
        app: the Application to insert.

    Raises:
        ValueError: if the PID is 0, the Desktop id is empty, or the time
            of start is later than the time of end.
    """
    if app.pid == 0:
        raise ValueError("Applications must have a valid PID.")
    if not app.desktopid:
        raise ValueError("Applications must have a Desktop identifier.")

    tstart = app.getTimeOfStart()
    tend = app.getTimeOfEnd()
    if tstart > tend:
        raise ValueError("Applications must have valid times of start and "
                         "end.")

    # Instances already known for this PID; we look for app's slot in it.
    pids = self.pidStore.get(app.pid, list())  # type: list

    # Index whose neighbours must be checked for a merge afterwards
    # (stays negative when no such check is wanted).
    neighbourCheckupIndex = -1

    for position, existing in enumerate(pids):
        existingStart = existing.getTimeOfStart()
        existingEnd = existing.getTimeOfEnd()

        # Existing entry is over before ours begins: look further.
        if existingEnd < tstart:
            continue

        # Existing entry begins after ours is over: this is our slot.
        if existingStart > tend:
            pids.insert(position, app)
            neighbourCheckupIndex = position
            break

        # From here on the two time periods conflict.
        if not ((existingEnd >= tstart) or (existingStart <= tend)):
            continue

        if app.hasSameDesktopId(existing, resolveInterpreter=True):
            # Same identity: fold the new Application into the old one.
            existing.merge(app)
            pids[position] = existing
            neighbourCheckupIndex = position
        else:
            # Apps A (being inserted) and B (already stored) overlap but
            # have different identities. Warn, then let dispatchSplit pick
            # the splitting algorithm suited to their relative order.
            print("Warning: Applications %s and %s overlap on PID %d" %
                  (app.desktopid, existing.desktopid, app.pid),
                  file=sys.stderr)
            pids = self.dispatchSplit(pids, position, app, existing)

            # We do not know where the inserted pieces ended up nor how
            # much the list grew, so rather than tracking both edges of
            # the new sequence we simply re-merge the whole (short) list.
            pids = self._mergePidList(pids)
        break
    else:
        # Every existing entry ends before ours starts: app goes last.
        pids.append(app)

    # Neighbours of the touched entry may carry the same Desktop id within
    # a proximity window; merging them helps Events from Zeitgeist and
    # PreloadLogger to synchronise.
    if neighbourCheckupIndex >= 0:
        pids = self._mergePidItem(pids, neighbourCheckupIndex)

    self.pidStore[app.getPid()] = pids
    self.nameStoreClean = False
def dispatchSplit(self, pids: list, index: int, app: Application, bpp: Application):
    """Pick the splitting routine matching how two Applications overlap.

    In the comments below, A is `app` and B is `bpp`. Depending on how
    their start and end times compare, either `_largeSmallSplit` (one
    runtime encloses the other) or `_mixedSplit` (the runtimes partially
    overlap) is invoked, always with `inserting=app`.

    Args:
        pids: list of Applications being edited.
        index: position in `pids` handed over to the split routine.
        app: Application A.
        bpp: Application B.

    Returns:
        The list returned by the chosen split routine, or `pids` untouched
        when neither overlap condition holds.
    """
    aStart, aEnd = app.getTimeOfStart(), app.getTimeOfEnd()
    bStart, bEnd = bpp.getTimeOfStart(), bpp.getTimeOfEnd()

    # First overlap condition: B is not over before A starts.
    if bEnd >= aStart:
        if bEnd >= aEnd:
            # B ends after A...
            if bStart <= aStart:
                # ...and started before it: A is embedded into B.
                return self._largeSmallSplit(pids, index, large=bpp,
                                             small=app, inserting=app)
            # ...but started during A.
            return self._mixedSplit(pids, index, before=app, after=bpp,
                                    inserting=app)
        # A ends after B...
        if bStart >= aStart:
            # ...and started before it: B is embedded into A.
            return self._largeSmallSplit(pids, index, large=app,
                                         small=bpp, inserting=app)
        # ...but started during B.
        return self._mixedSplit(pids, index, before=bpp, after=app,
                                inserting=app)

    # Second overlap condition: B starts no later than A ends.
    if not (bStart <= aEnd):
        return pids

    if bStart <= aStart:
        if bEnd >= aEnd:
            return self._largeSmallSplit(pids, index, large=bpp,
                                         small=app, inserting=app)
        return self._mixedSplit(pids, index, before=bpp, after=app,
                                inserting=app)
    if bEnd <= aEnd:
        return self._largeSmallSplit(pids, index, large=app, small=bpp,
                                     inserting=app)
    return self._mixedSplit(pids, index, before=app, after=bpp,
                            inserting=app)