def db_finish(self):
    """Record the completed scan in the database.

    Closes out this scan's row, writes one term_use row per
    (term, use) pair counted during the scan, records the primary
    and backup trackers, and finally obsoletes older scans of the
    same data source.
    """
    finished_at = time.time()
    # status=1 marks the scan as completed successfully.
    self.db.update('scan',
                   where="id = " + str(self.id),
                   triples=self.c,
                   time_complete=finished_at,
                   status=1)
    # term_uses keys are (term, use) pairs; values are counts.
    for (term, use), count in self.term_uses.items():
        (ns, local) = splitter.split(term)
        self.db.insert('term_use',
                       local=local,
                       namespace_id=irimap.to_id(self.db, ns),
                       scan_id=self.id,
                       type=use,
                       count=count)
    # All primary trackers first, then all backups, distinguished
    # only by the is_primary flag.
    tagged = ([(t, True) for t in self.primary_trackers] +
              [(t, False) for t in self.backup_trackers])
    for tracker, primary in tagged:
        self.db.insert('trackers',
                       scan_id=self.id,
                       tracker_id=irimap.to_id(self.db, tracker),
                       is_primary=primary)
    obsolete_old_scans(self.db, self.data_source_iri, self.id)
def list_(term, timecode=(-1), db=None):  # , limit, offset):
    """Yield one "TYPE TEXT" unicode line per use of *term* at *timecode*.

    see http://dev.mysql.com/doc/refman/5.0/en/select.html

    for us, timecodes ARE scanids. A new scanid == a new change.

    are we going to need to SORT when we do limit/offset? pagerank?

    Raises GarbageTimecode when *timecode* is older than anything
    recorded in the database.
    """
    if db is None:
        db = dbconn.Connection()
    (ns, local) = splitter.split(term)
    ns_id = irimap.to_id(db, ns)
    if timecode == -1:
        # -1 is the sentinel meaning "as of the most recent scan".
        timecode = latest_timecode(db)
        print "list using latest timecode:", timecode
    else:
        if timecode < min_timecode(db):
            raise GarbageTimecode()
    # Time-travel visibility rule: a term_use row is visible at
    # *timecode* when its scan happened at or before that time
    # (scan_id <= timecode) and it has not yet been superseded
    # (obsoleted_by > timecode).  The query's $placeholders are
    # filled from locals(), so the local names above matter.
    for r in db.query('select text, type from term_use, scan, iri where scan_id <= $timecode and obsoleted_by > $timecode and namespace_id=$ns_id and scan.id=scan_id and status=1 and local=$local and iri.id=source_id', vars=locals()):
        yield unicode(r.type)+" "+unicode(r.text)
def obsolete_old_scans(db, source, scan_id):
    """Mark all earlier scans of *source* as obsoleted by *scan_id*.

    Every scan of the same source with a smaller id that is still
    current (obsoleted_by == max_timecode) gets its obsoleted_by set
    to *scan_id*, so time-travel queries can tell which scan
    superseded it.
    """
    source_id = irimap.to_id(db, source)
    # Renamed from `max`, which shadowed the builtin max(); the SQL
    # placeholder below is resolved from locals(), so it is renamed
    # in step with the local.
    max_tc = max_timecode
    db.update('scan',
              where='id < $scan_id and source_id = $source_id and obsoleted_by = $max_tc',
              obsoleted_by=scan_id,
              vars=locals())
def db_show(db, source, ns): good = get_latest_scan(db, source) scan_id = good.id ns_id = irimap.to_id(db, ns) print `good` for r in db.select('term_use', where='scan_id=$scan_id and namespace_id=$ns_id', vars=locals()): print r.count, r.local
def db_start(self):
    """Create this scan's database record and remember its row id.

    Inserts a new 'scan' row with status 0 (in progress) and
    obsoleted_by = max_timecode (not yet superseded), stashing the
    source's iri id in self.source_id and the new row id in self.id.
    """
    debug('scan', 'database connection started')
    self.source_id = irimap.to_id(self.db, self.data_source_iri)
    row = dict(source_id=self.source_id,
               time_begun=self.start,
               triples=0,
               last_modified=self.last_modified,
               status=0,  # 0 = in progress; or just use time_completed?
               obsoleted_by=max_timecode)
    self.id = self.db.insert('scan', **row)
    debug('scan', 'database record created', self.id)
def report(source, ns):
    """
    Return a report [in std format?] of the given source,
    those entries in the given namespace

    Each line of the returned unicode string is "count type local".
    """
    db = dbconn.Connection()
    scan = ensure_scanned(db, source)
    scan_id = scan.id
    ns_id = irimap.to_id(db, ns)
    results = db.select('term_use',
                        where="scan_id=$scan_id and namespace_id=$ns_id",
                        vars=locals())
    # Collect and join once instead of quadratic += concatenation.
    lines = [u"%d %s %s\n" % (r.count, r.type, r.local) for r in results]
    del db
    return u"".join(lines)
def get_latest_scan(db, source, all_scan_ids=None):
    '''Return a record of the latest completed scan of this source.

    If an array all_scan_ids is provided, all the scan ids will be
    appended to it.  Raises NoGoodScan when no completed (status==1)
    scan of the source exists.
    '''
    if all_scan_ids is None:
        all_scan_ids = []
    source_id = irimap.to_id(db, source)
    best_id = -1
    best_row = None
    for row in db.select('scan', where='source_id=$source_id', vars=locals()):
        # Every scan id is reported, completed or not.
        all_scan_ids.append(row.id)
        if row.status == 1 and row.id > best_id:
            # Deep-copy so the returned record outlives the cursor.
            best_id = row.id
            best_row = copy.deepcopy(row)
    if best_row is None:
        raise NoGoodScan()
    return best_row