def __init__(self, basedir, doctype, encoding='utf-8',
             indexstyle=None, threshold=100, verbose=False):
    """Set up the storage objects rooted at *basedir*.

    The corpus, the article database and the index database are built
    on the 'src', 'articles' and 'idx' entries under *basedir*.
    The object starts out not open: ``_mode`` is None and there is no
    pending-index set yet.
    """
    self.basedir = basedir
    self.threshold = threshold
    self.verbose = verbose

    # Locations of the three backing stores under the base directory.
    src_path = os.path.join(basedir, 'src')
    articles_path = os.path.join(basedir, 'articles')
    idx_path = os.path.join(basedir, 'idx')

    corpus_class = self.GzipTarDBCorpusWithLabel
    self._corpus = corpus_class(src_path, doctype, encoding,
                                indexstyle=indexstyle)
    self._artdb = FixedDB(articles_path)
    self._indexdb = IndexDB(idx_path, 'idx')

    # Nothing is open and nothing is waiting to be indexed yet.
    self._loctoindex = None
    self._mode = None
def __init__(self, basedir, doctype, encoding='utf-8',
             indexstyle=None, threshold=100, verbose=False):
    """Set up the storage objects rooted at *basedir*.

    The corpus, the article database and the index database are built
    on the 'src', 'articles' and 'idx' entries under *basedir*.
    The object starts out not open: ``_mode`` is None and there is no
    pending-index set yet.
    """
    self.basedir = basedir
    self.threshold = threshold
    self.verbose = verbose

    # Locations of the three backing stores under the base directory.
    src_path = os.path.join(basedir, 'src')
    articles_path = os.path.join(basedir, 'articles')
    idx_path = os.path.join(basedir, 'idx')

    corpus_class = self.GzipTarDBCorpusWithLabel
    self._corpus = corpus_class(src_path, doctype, encoding,
                                indexstyle=indexstyle)
    self._artdb = FixedDB(articles_path)
    self._indexdb = IndexDB(idx_path, 'idx')

    # Nothing is open and nothing is waiting to be indexed yet.
    self._loctoindex = None
    self._mode = None
class TarCMS(object):
    """Content Management with tar files

    Sample usage:

      # Create a TarCMS object.
      cms = TarCMS(basedir, doctype)
      # Actually create the structure on disk.
      cms.create()
      # Open it.
      cms.open(mode='w')
      # Add a new document.
      aid = cms.create_article('this is my text.')
      # Modify the document.
      tid = cms.modify_article(aid, 'this is my revised text.')
      # Search all documents.
      for (tid,mtime,title,snippet) in cms.find_snapshots(queries):
        data = cms.get_data(tid)
      # Retrieve all revisions of an article:
      for tid in cms.list_snapshots(aid):
        data = cms.get_data(tid)
      # Close it.
      cms.close()
      # Check the validity of the metadata.
      cms.validate()
      # Recover the metadata.
      cms.recover()
    """

    class GzipTarDBCorpusWithLabel(GzipTarDBCorpus):
        """Corpus whose location labels come from the stored entry name."""

        def loc_labels(self, loc):
            """Return the entry name minus its 8-character id prefix.

            The result is a one-element list, or an empty list when
            nothing follows the prefix.
            """
            info = GzipTarDBCorpus.get_info(self, loc)
            # The first 8 characters are the article id prepended by
            # create_article() / modify_article().
            name = info.name[8:]
            if name:
                return [name]
            return []

    class TarCMSError(Exception):
        """Base error for TarCMS (wrong open/closed state, bad metadata)."""
        pass

    class ArticleNotFound(TarCMSError):
        """Raised when an article/snapshot id has no article-db record."""
        pass

    def __init__(self, basedir, doctype, encoding='utf-8',
                 indexstyle=None, threshold=100, verbose=False):
        """Set up the storage objects rooted at *basedir*.

        basedir    -- directory holding the 'src', 'articles' and 'idx'
                      entries.
        doctype, encoding, indexstyle -- passed through to the corpus.
        threshold  -- number of unindexed snapshots that triggers an
                      automatic flush(); a false value disables it.
        verbose    -- passed to the Indexer.
        """
        self.basedir = basedir
        self.threshold = threshold
        self.verbose = verbose
        self._corpus = self.GzipTarDBCorpusWithLabel(
            os.path.join(basedir, 'src'), doctype, encoding,
            indexstyle=indexstyle)
        self._artdb = FixedDB(os.path.join(basedir, 'articles'))
        self._indexdb = IndexDB(os.path.join(basedir, 'idx'), 'idx')
        # Snapshot ids added since the last flush; a set while open.
        self._loctoindex = None
        # None while closed, otherwise the mode string given to open().
        self._mode = None

    def __repr__(self):
        return '<TarCMS: basedir=%r>' % (self.basedir,)

    def __iter__(self):
        """Iterate over article ids (same as list_articles())."""
        return self.list_articles()

    def create(self):
        """Create the corpus, the article db and the index db.

        Raises TarCMSError if the object is currently open.
        """
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._corpus.create()
        # NOTE(review): 9 is presumably the fixed record size (8 hex
        # digits plus a separator) -- confirm against FixedDB.
        self._artdb.create(9)
        self._indexdb.create()

    def open(self, mode='r'):
        """Open all backing stores in *mode*.

        Raises TarCMSError if the object is already open.
        """
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._corpus.open(mode=mode)
        self._artdb.open(mode=mode)
        self._indexdb.open()
        self._loctoindex = set()
        self._mode = mode

    def close(self):
        """Flush pending work and close all backing stores.

        Raises TarCMSError if the object is not open.
        """
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        self.flush()
        self._corpus.close()
        self._artdb.close()
        self._indexdb.close()
        self._mode = None

    def _add_corpus(self, info, data):
        """Store *data* in the corpus and queue it for indexing.

        Returns the new snapshot id. Triggers flush() once the number of
        queued snapshots reaches the threshold.
        """
        assert self._mode is not None
        tid = self._corpus.add_data(info, data)
        self._loctoindex.add(tid)
        if self.threshold and self.threshold <= len(self._loctoindex):
            self.flush()
        return tid

    def _add_file(self, info, path):
        """Read the file at *path* and store its bytes via _add_corpus()."""
        assert self._mode is not None
        # `with` closes the handle even if read() fails; open() is used
        # because the file() builtin does not exist in Python 3.
        with open(path, 'rb') as fp:
            data = fp.read()
        return self._add_corpus(info, data)

    def flush(self):
        """Flush the corpus and article db, then index queued snapshots.

        Raises TarCMSError if the object is not open.
        """
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        self._corpus.flush()
        self._artdb.flush()
        indexer = Indexer(self._indexdb, self._corpus, verbose=self.verbose)
        for tid in self._loctoindex:
            indexer.index_loc(tid)
        indexer.finish()
        self._loctoindex.clear()

    def create_article(self, data, info=None):
        """Add a new article and return its article id.

        The id is the next article-db record number as 8 hex digits, and
        it is prepended to ``info.name`` (the passed-in *info* is
        modified). Raises TarCMSError if the object is not open.
        """
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        if info is None:
            info = TarInfo()
        assert isinstance(info, TarInfo)
        aid = '%08x' % self._artdb.nextrecno()
        info.name = aid + info.name
        tid = self._add_corpus(info, data)
        # The corpus location and the article-db record number must
        # advance in lockstep.
        assert aid == tid
        # A fresh article's record points at itself.
        self._artdb.add_record(tid)
        return aid

    def modify_article(self, aid, data, info=None):
        """Add a new revision of article *aid* and return its snapshot id.

        When *info* is omitted, the info of the current latest snapshot
        is reused. Raises TarCMSError if the object is not open.
        """
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        # tid0: the latest snapshot before this modification.
        tid0 = self._artdb.get_record(int(aid, 16))
        if info is None:
            info = self.get_info(tid0)
        assert isinstance(info, TarInfo)
        info.name = aid + info.name
        loc = self._add_corpus(info, data)
        # The new snapshot's record points back at the previous one...
        tid = '%08x' % self._artdb.add_record(tid0)
        assert loc == tid
        # ...and the article's own record now points at the new snapshot.
        self._artdb.set_record(int(aid, 16), tid)
        return tid

    def list_snapshots(self, aid=None):
        """Get all revisions of an article.

        With aid=None, yields every record of the article db instead.
        Otherwise follows the record chain from the article's record
        until it reaches *aid* itself, which is yielded last.
        Raises TarCMSError if not open, and ArticleNotFound if *aid* has
        no record.
        """
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        if aid is None:
            for tid in self._artdb:
                yield tid
        else:
            try:
                tid = self._artdb.get_record(int(aid, 16))
            except FixedDB.InvalidRecord:
                raise TarCMS.ArticleNotFound(aid)
            while aid != tid:
                yield tid
                tid = self._artdb.get_record(int(tid, 16))
            yield tid

    def list_articles(self):
        """Yield the ids of records that point at themselves.

        Raises TarCMSError if the object is not open.
        """
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        for (recno, tid) in enumerate(self._artdb):
            aid = '%08x' % recno
            if aid == tid:
                yield aid

    def find_snapshots(self, preds, disjunctive=False):
        """Find snapshots that match to the predicates.

        Yields whatever Selection.get_snippet() returns for each hit.
        Raises TarCMSError if the object is not open.
        """
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        sel = Selection(self._indexdb, preds, disjunctive=disjunctive)
        for x in sel:
            yield sel.get_snippet(x)

    def find_articles(self, preds, disjunctive=False):
        """Like find_snapshots(), but yields at most one hit per id.

        Yields (aid, mtime, title, snippet) tuples. Raises
        ArticleNotFound when a hit has no article-db record.
        """
        sel = self.find_snapshots(preds, disjunctive=disjunctive)
        aids = set()
        for (tid, mtime, title, snippet) in sel:
            try:
                # NOTE(review): the record for a snapshot holds the
                # previous snapshot (see modify_article), so for third
                # and later revisions this looks like it is not the
                # article id -- confirm the intended mapping.
                aid = self._artdb.get_record(int(tid, 16))
            except FixedDB.InvalidRecord:
                raise TarCMS.ArticleNotFound(tid)
            if aid not in aids:
                aids.add(aid)
                yield (aid, mtime, title, snippet)

    def get_info(self, tid):
        """Get the information about the snapshot specified by tid.

        The 8-character article-id prefix is stripped from the name.
        Raises TarCMSError if the object is not open.
        """
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        info = self._corpus.get_info(tid)
        info.name = info.name[8:]
        return info

    def get_data(self, tid):
        """Get a particular revision of article specified by tid."""
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        return self._corpus.get_data(tid)

    def get_latest(self, aid):
        """Equivalent to self.list_snapshots(aid)[0]."""
        for tid in self.list_snapshots(aid):
            return tid
        raise KeyError(aid)

    def _get_tids(self):
        """Rebuild the expected article-db contents from the corpus.

        Replays every corpus location in order, applying the same record
        updates that create_article() and modify_article() perform.
        Raises TarCMSError if the object is open.
        """
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._corpus.open(mode='r')
        try:
            tids = []
            for tid in self._corpus.get_all_locs():
                info = self._corpus.get_info(tid)
                aid = info.name[:8]
                if tid == aid:
                    # New article: the record points at itself.
                    tids.append(tid)
                else:
                    # Revision: the new record keeps the previous latest
                    # and the article's record moves to this snapshot.
                    i = int(aid, 16)
                    tids.append(tids[i])
                    tids[i] = tid
        finally:
            # Do not leave the corpus open if replaying fails.
            self._corpus.close()
        return tids

    def validate_artdb(self):
        """Check the article db against the records rebuilt from the corpus.

        Raises TarCMSError if the object is open or any record differs.
        """
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._artdb.open(mode='r')
        try:
            for (entry, tid) in ezip(self._artdb, self._get_tids()):
                if entry != tid:
                    raise TarCMS.TarCMSError(
                        'article db mismatch: %r != %r' % (entry, tid))
        finally:
            # Close the db on the failure path too.
            self._artdb.close()

    def recover_artdb(self):
        """Rewrite the article db from the records rebuilt from the corpus.

        Raises TarCMSError if the object is open.
        """
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._artdb.open(mode='w')
        try:
            for tid in self._get_tids():
                self._artdb.add_record(tid)
        finally:
            self._artdb.close()

    def validate(self):
        """Validate the corpus catalog and the article db.

        Raises TarCMSError if the object is open.
        """
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._corpus.validate_catalog()
        self.validate_artdb()

    def recover(self):
        """Recover the catalog and article db, then rebuild the index.

        Raises TarCMSError if the object is open.
        """
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._corpus.recover_catalog()
        self.recover_artdb()
        self._indexdb.reset()
        # Use the instance setting; a bare `verbose` is undefined here.
        indexer = Indexer(self._indexdb, self._corpus, verbose=self.verbose)
        # NOTE(review): the corpus is not explicitly opened before
        # get_all_locs() here, unlike in _get_tids() -- confirm that
        # this works on a closed corpus.
        for tid in self._corpus.get_all_locs():
            indexer.index_loc(tid)
        indexer.finish()
class TarCMS(object):
    """Content Management with tar files

    Sample usage:

      # Create a TarCMS object.
      cms = TarCMS(basedir, doctype)
      # Actually create the structure on disk.
      cms.create()
      # Open it.
      cms.open(mode='w')
      # Add a new document.
      aid = cms.create_article('this is my text.')
      # Modify the document.
      tid = cms.modify_article(aid, 'this is my revised text.')
      # Search all documents.
      for (tid,mtime,title,snippet) in cms.find_snapshots(queries):
        data = cms.get_data(tid)
      # Retrieve all revisions of an article:
      for tid in cms.list_snapshots(aid):
        data = cms.get_data(tid)
      # Close it.
      cms.close()
      # Check the validity of the metadata.
      cms.validate()
      # Recover the metadata.
      cms.recover()
    """

    class GzipTarDBCorpusWithLabel(GzipTarDBCorpus):
        """Corpus whose location labels come from the stored entry name."""

        def loc_labels(self, loc):
            """Return the entry name minus its 8-character id prefix.

            The result is a one-element list, or an empty list when
            nothing follows the prefix.
            """
            info = GzipTarDBCorpus.get_info(self, loc)
            # The first 8 characters are the article id prepended by
            # create_article() / modify_article().
            name = info.name[8:]
            if name:
                return [name]
            return []

    class TarCMSError(Exception):
        """Base error for TarCMS (wrong open/closed state, bad metadata)."""
        pass

    class ArticleNotFound(TarCMSError):
        """Raised when an article/snapshot id has no article-db record."""
        pass

    def __init__(self, basedir, doctype, encoding='utf-8',
                 indexstyle=None, threshold=100, verbose=False):
        """Set up the storage objects rooted at *basedir*.

        basedir    -- directory holding the 'src', 'articles' and 'idx'
                      entries.
        doctype, encoding, indexstyle -- passed through to the corpus.
        threshold  -- number of unindexed snapshots that triggers an
                      automatic flush(); a false value disables it.
        verbose    -- passed to the Indexer.
        """
        self.basedir = basedir
        self.threshold = threshold
        self.verbose = verbose
        self._corpus = self.GzipTarDBCorpusWithLabel(
            os.path.join(basedir, 'src'), doctype, encoding,
            indexstyle=indexstyle)
        self._artdb = FixedDB(os.path.join(basedir, 'articles'))
        self._indexdb = IndexDB(os.path.join(basedir, 'idx'), 'idx')
        # Snapshot ids added since the last flush; a set while open.
        self._loctoindex = None
        # None while closed, otherwise the mode string given to open().
        self._mode = None

    def __repr__(self):
        return '<TarCMS: basedir=%r>' % (self.basedir,)

    def __iter__(self):
        """Iterate over article ids (same as list_articles())."""
        return self.list_articles()

    def create(self):
        """Create the corpus, the article db and the index db.

        Raises TarCMSError if the object is currently open.
        """
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._corpus.create()
        # NOTE(review): 9 is presumably the fixed record size (8 hex
        # digits plus a separator) -- confirm against FixedDB.
        self._artdb.create(9)
        self._indexdb.create()

    def open(self, mode='r'):
        """Open all backing stores in *mode*.

        Raises TarCMSError if the object is already open.
        """
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._corpus.open(mode=mode)
        self._artdb.open(mode=mode)
        self._indexdb.open()
        self._loctoindex = set()
        self._mode = mode

    def close(self):
        """Flush pending work and close all backing stores.

        Raises TarCMSError if the object is not open.
        """
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        self.flush()
        self._corpus.close()
        self._artdb.close()
        self._indexdb.close()
        self._mode = None

    def _add_corpus(self, info, data):
        """Store *data* in the corpus and queue it for indexing.

        Returns the new snapshot id. Triggers flush() once the number of
        queued snapshots reaches the threshold.
        """
        assert self._mode is not None
        tid = self._corpus.add_data(info, data)
        self._loctoindex.add(tid)
        if self.threshold and self.threshold <= len(self._loctoindex):
            self.flush()
        return tid

    def _add_file(self, info, path):
        """Read the file at *path* and store its bytes via _add_corpus()."""
        assert self._mode is not None
        # `with` closes the handle even if read() fails; open() is used
        # because the file() builtin does not exist in Python 3.
        with open(path, 'rb') as fp:
            data = fp.read()
        return self._add_corpus(info, data)

    def flush(self):
        """Flush the corpus and article db, then index queued snapshots.

        Raises TarCMSError if the object is not open.
        """
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        self._corpus.flush()
        self._artdb.flush()
        indexer = Indexer(self._indexdb, self._corpus, verbose=self.verbose)
        for tid in self._loctoindex:
            indexer.index_loc(tid)
        indexer.finish()
        self._loctoindex.clear()

    def create_article(self, data, info=None):
        """Add a new article and return its article id.

        The id is the next article-db record number as 8 hex digits, and
        it is prepended to ``info.name`` (the passed-in *info* is
        modified). Raises TarCMSError if the object is not open.
        """
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        if info is None:
            info = TarInfo()
        assert isinstance(info, TarInfo)
        aid = '%08x' % self._artdb.nextrecno()
        info.name = aid + info.name
        tid = self._add_corpus(info, data)
        # The corpus location and the article-db record number must
        # advance in lockstep.
        assert aid == tid
        # A fresh article's record points at itself.
        self._artdb.add_record(tid)
        return aid

    def modify_article(self, aid, data, info=None):
        """Add a new revision of article *aid* and return its snapshot id.

        When *info* is omitted, the info of the current latest snapshot
        is reused. Raises TarCMSError if the object is not open.
        """
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        # tid0: the latest snapshot before this modification.
        tid0 = self._artdb.get_record(int(aid, 16))
        if info is None:
            info = self.get_info(tid0)
        assert isinstance(info, TarInfo)
        info.name = aid + info.name
        loc = self._add_corpus(info, data)
        # The new snapshot's record points back at the previous one...
        tid = '%08x' % self._artdb.add_record(tid0)
        assert loc == tid
        # ...and the article's own record now points at the new snapshot.
        self._artdb.set_record(int(aid, 16), tid)
        return tid

    def list_snapshots(self, aid=None):
        """Get all revisions of an article.

        With aid=None, yields every record of the article db instead.
        Otherwise follows the record chain from the article's record
        until it reaches *aid* itself, which is yielded last.
        Raises TarCMSError if not open, and ArticleNotFound if *aid* has
        no record.
        """
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        if aid is None:
            for tid in self._artdb:
                yield tid
        else:
            try:
                tid = self._artdb.get_record(int(aid, 16))
            except FixedDB.InvalidRecord:
                raise TarCMS.ArticleNotFound(aid)
            while aid != tid:
                yield tid
                tid = self._artdb.get_record(int(tid, 16))
            yield tid

    def list_articles(self):
        """Yield the ids of records that point at themselves.

        Raises TarCMSError if the object is not open.
        """
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        for (recno, tid) in enumerate(self._artdb):
            aid = '%08x' % recno
            if aid == tid:
                yield aid

    def find_snapshots(self, preds, disjunctive=False):
        """Find snapshots that match to the predicates.

        Yields whatever Selection.get_snippet() returns for each hit.
        Raises TarCMSError if the object is not open.
        """
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        sel = Selection(self._indexdb, preds, disjunctive=disjunctive)
        for x in sel:
            yield sel.get_snippet(x)

    def find_articles(self, preds, disjunctive=False):
        """Like find_snapshots(), but yields at most one hit per id.

        Yields (aid, mtime, title, snippet) tuples. Raises
        ArticleNotFound when a hit has no article-db record.
        """
        sel = self.find_snapshots(preds, disjunctive=disjunctive)
        aids = set()
        for (tid, mtime, title, snippet) in sel:
            try:
                # NOTE(review): the record for a snapshot holds the
                # previous snapshot (see modify_article), so for third
                # and later revisions this looks like it is not the
                # article id -- confirm the intended mapping.
                aid = self._artdb.get_record(int(tid, 16))
            except FixedDB.InvalidRecord:
                raise TarCMS.ArticleNotFound(tid)
            if aid not in aids:
                aids.add(aid)
                yield (aid, mtime, title, snippet)

    def get_info(self, tid):
        """Get the information about the snapshot specified by tid.

        The 8-character article-id prefix is stripped from the name.
        Raises TarCMSError if the object is not open.
        """
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        info = self._corpus.get_info(tid)
        info.name = info.name[8:]
        return info

    def get_data(self, tid):
        """Get a particular revision of article specified by tid."""
        if not self._mode:
            raise TarCMS.TarCMSError('not open: %r' % self)
        return self._corpus.get_data(tid)

    def get_latest(self, aid):
        """Equivalent to self.list_snapshots(aid)[0]."""
        for tid in self.list_snapshots(aid):
            return tid
        raise KeyError(aid)

    def _get_tids(self):
        """Rebuild the expected article-db contents from the corpus.

        Replays every corpus location in order, applying the same record
        updates that create_article() and modify_article() perform.
        Raises TarCMSError if the object is open.
        """
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._corpus.open(mode='r')
        try:
            tids = []
            for tid in self._corpus.get_all_locs():
                info = self._corpus.get_info(tid)
                aid = info.name[:8]
                if tid == aid:
                    # New article: the record points at itself.
                    tids.append(tid)
                else:
                    # Revision: the new record keeps the previous latest
                    # and the article's record moves to this snapshot.
                    i = int(aid, 16)
                    tids.append(tids[i])
                    tids[i] = tid
        finally:
            # Do not leave the corpus open if replaying fails.
            self._corpus.close()
        return tids

    def validate_artdb(self):
        """Check the article db against the records rebuilt from the corpus.

        Raises TarCMSError if the object is open or any record differs.
        """
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._artdb.open(mode='r')
        try:
            for (entry, tid) in ezip(self._artdb, self._get_tids()):
                if entry != tid:
                    raise TarCMS.TarCMSError(
                        'article db mismatch: %r != %r' % (entry, tid))
        finally:
            # Close the db on the failure path too.
            self._artdb.close()

    def recover_artdb(self):
        """Rewrite the article db from the records rebuilt from the corpus.

        Raises TarCMSError if the object is open.
        """
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._artdb.open(mode='w')
        try:
            for tid in self._get_tids():
                self._artdb.add_record(tid)
        finally:
            self._artdb.close()

    def validate(self):
        """Validate the corpus catalog and the article db.

        Raises TarCMSError if the object is open.
        """
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._corpus.validate_catalog()
        self.validate_artdb()

    def recover(self):
        """Recover the catalog and article db, then rebuild the index.

        Raises TarCMSError if the object is open.
        """
        if self._mode:
            raise TarCMS.TarCMSError('already open: %r' % self)
        self._corpus.recover_catalog()
        self.recover_artdb()
        self._indexdb.reset()
        # Use the instance setting; a bare `verbose` is undefined here.
        indexer = Indexer(self._indexdb, self._corpus, verbose=self.verbose)
        # NOTE(review): the corpus is not explicitly opened before
        # get_all_locs() here, unlike in _get_tids() -- confirm that
        # this works on a closed corpus.
        for tid in self._corpus.get_all_locs():
            indexer.index_loc(tid)
        indexer.finish()