def start(self):
    self._delta_db = IndexedDatabase(
        artifact_manager.get_temp_file(config.RCS_DELTAS_STORE),
        artifact_manager.get_temp_file(config.RCS_DELTAS_INDEX_TABLE),
        DB_OPEN_READ,
    )
    # Make deletions no-ops, presumably so that code shared with the
    # write path can "free" deltas without modifying this read-only
    # database:
    self._delta_db.__delitem__ = lambda id: None
    self._tree_db = IndexedDatabase(
        artifact_manager.get_temp_file(config.RCS_TREES_STORE),
        artifact_manager.get_temp_file(config.RCS_TREES_INDEX_TABLE),
        DB_OPEN_READ,
    )
    serializer = MarshalSerializer()
    if self._compress:
        serializer = CompressingSerializer(serializer)
    self._co_db = self._Database(
        artifact_manager.get_temp_file(config.CVS_CHECKOUT_DB),
        DB_OPEN_NEW,
        serializer,
    )

    # The set of CVSFile instances whose TextRecords have already been
    # read:
    self._loaded_files = set()

    # A map { CVSFILE : _FileTree } for files that currently have live
    # revisions:
    self._text_record_db = TextRecordDatabase(self._delta_db, self._co_db)
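# Aside: a minimal toy sketch (not cvs2svn code) of the __delitem__
# trick above.  Replacing __delitem__ with a no-op presumably lets
# cleanup code that is shared with the write path "free" deltas without
# modifying the read-only delta store.  Note that in Python, implicit
# `del db[key]` looks __delitem__ up on the *class* for new-style
# classes, so the per-instance assignment above relies on either an
# old-style class or an explicit db.__delitem__(id) call; a wrapper
# class makes the same idea work unconditionally:

class NoOpDeleteWrapper(object):
    """Wrap a mapping so that `del wrapper[key]` is silently ignored."""

    def __init__(self, db):
        self._db = db

    def __getitem__(self, key):
        return self._db[key]

    def __delitem__(self, key):
        pass  # Deliberately ignore deletions; the store stays intact.

db = NoOpDeleteWrapper({17: 'delta text'})
del db[17]                      # no-op
assert db[17] == 'delta text'   # still readable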
class InternalRevisionCollector(RevisionCollector):
    """The RevisionCollector used by InternalRevisionReader."""

    def __init__(self, compress):
        RevisionCollector.__init__(self)
        self._compress = compress

    def register_artifacts(self, which_pass):
        artifact_manager.register_temp_file(
            config.RCS_DELTAS_INDEX_TABLE, which_pass
        )
        artifact_manager.register_temp_file(
            config.RCS_DELTAS_STORE, which_pass
        )
        artifact_manager.register_temp_file(
            config.RCS_TREES_INDEX_TABLE, which_pass
        )
        artifact_manager.register_temp_file(
            config.RCS_TREES_STORE, which_pass
        )

    def start(self):
        serializer = MarshalSerializer()
        if self._compress:
            serializer = CompressingSerializer(serializer)
        self._delta_db = IndexedDatabase(
            artifact_manager.get_temp_file(config.RCS_DELTAS_STORE),
            artifact_manager.get_temp_file(config.RCS_DELTAS_INDEX_TABLE),
            DB_OPEN_NEW,
            serializer,
        )
        primer = (FullTextRecord, DeltaTextRecord)
        self._rcs_trees = IndexedDatabase(
            artifact_manager.get_temp_file(config.RCS_TREES_STORE),
            artifact_manager.get_temp_file(config.RCS_TREES_INDEX_TABLE),
            DB_OPEN_NEW,
            PrimedPickleSerializer(primer),
        )

    def _writeout(self, text_record, text):
        self.text_record_db.add(text_record)
        self._delta_db[text_record.id] = text

    def process_file(self, cvs_file_items):
        """Read revision information for the file described by
        CVS_FILE_ITEMS.

        Compute the text record refcounts, discard any records that are
        unneeded, and store the text records for the file to the
        _rcs_trees database."""

        # A map from cvs_rev_id to TextRecord instance:
        self.text_record_db = TextRecordDatabase(
            self._delta_db, NullDatabase()
        )

        parse(
            open(cvs_file_items.cvs_file.rcs_path, 'rb'),
            _Sink(self, cvs_file_items),
        )

        self.text_record_db.recompute_refcounts(cvs_file_items)
        self.text_record_db.free_unused()
        self._rcs_trees[cvs_file_items.cvs_file.id] = self.text_record_db
        del self.text_record_db

    def finish(self):
        self._delta_db.close()
        self._rcs_trees.close()
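# A self-contained toy sketch (not the real TextRecordDatabase API) of
# the pattern process_file() applies: after parsing, recompute how many
# consumers still reference each stored text, drop the unreferenced
# records, and only then persist the per-file tree.

class ToyTextRecordDB(object):
    def __init__(self):
        self.records = {}    # record id -> text
        self.refcounts = {}  # record id -> reference count

    def add(self, rec_id, text):
        self.records[rec_id] = text
        self.refcounts[rec_id] = 0

    def recompute_refcounts(self, needed_ids):
        # Count how many times each record is still needed:
        for rec_id in self.refcounts:
            self.refcounts[rec_id] = needed_ids.count(rec_id)

    def free_unused(self):
        # Discard records that nothing references any more:
        for rec_id in [r for (r, n) in self.refcounts.items() if n == 0]:
            del self.records[rec_id]
            del self.refcounts[rec_id]

db = ToyTextRecordDB()
db.add('1.1', 'v1 text')
db.add('1.2', 'v2 text')
db.recompute_refcounts(['1.2'])  # only revision 1.2 is still needed
db.free_unused()
assert '1.1' not in db.records and '1.2' in db.records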
def MetadataDatabase(store_filename, index_table_filename, mode):
    """A database to store Metadata instances that describe CVSRevisions.

    This database manages a map id -> Metadata instance, where id is a
    unique identifier for the metadata."""

    return IndexedDatabase(
        store_filename,
        index_table_filename,
        mode,
        PrimedPickleSerializer((Metadata,)),
    )
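# Usage sketch.  The filenames and the make_metadata() helper are
# hypothetical, shown only to illustrate the id -> Metadata mapping;
# the real store/index artifact names live elsewhere in the codebase.

db = MetadataDatabase('metadata.dat', 'metadata-index.dat', DB_OPEN_NEW)
metadata = make_metadata()    # hypothetical: build a Metadata instance
db[metadata.id] = metadata    # store, keyed by the metadata's unique id
retrieved = db[metadata.id]   # retrieve later by the same id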
class PersistenceManager:
    """Store SVNCommits to disk and retrieve them later using only
    their Subversion revision number as the key.  Also map a
    CVSRevision's unique id to the Subversion revision number in which
    it was committed.

    All information pertinent to each SVNCommit is stored in a series
    of on-disk databases so that SVNCommits can be retrieved on-demand.

    MODE is one of the constants DB_OPEN_NEW or DB_OPEN_READ.  In 'new'
    mode, PersistenceManager initializes a new set of on-disk databases
    and is fully featured.  In 'read' mode, it opens existing on-disk
    databases, and write operations (put_svn_commit()) raise a
    RuntimeError."""

    def __init__(self, mode):
        self.mode = mode
        if mode not in (DB_OPEN_NEW, DB_OPEN_READ):
            raise RuntimeError("Invalid 'mode' argument to PersistenceManager")
        primer = (
            SVNInitialProjectCommit,
            SVNPrimaryCommit,
            SVNPostCommit,
            SVNBranchCommit,
            SVNTagCommit,
        )
        serializer = PrimedPickleSerializer(primer)
        self.svn_commit_db = IndexedDatabase(
            artifact_manager.get_temp_file(config.SVN_COMMITS_INDEX_TABLE),
            artifact_manager.get_temp_file(config.SVN_COMMITS_STORE),
            mode,
            serializer,
        )
        self.cvs2svn_db = RecordTable(
            artifact_manager.get_temp_file(config.CVS_REVS_TO_SVN_REVNUMS),
            mode,
            SignedIntegerPacker(SVN_INVALID_REVNUM),
        )

    def get_svn_revnum(self, cvs_rev_id):
        """Return the Subversion revision number in which CVS_REV_ID was
        committed, or SVN_INVALID_REVNUM if there is no mapping for
        CVS_REV_ID."""

        return self.cvs2svn_db.get(cvs_rev_id, SVN_INVALID_REVNUM)

    def get_svn_commit(self, svn_revnum):
        """Return the SVNCommit that corresponds to SVN_REVNUM, or None
        if no SVNCommit exists for that revision number."""

        return self.svn_commit_db.get(svn_revnum, None)

    def put_svn_commit(self, svn_commit):
        """Record SVN_COMMIT under its revision number, and map each of
        its CVSRevisions back to that revision number."""

        if self.mode == DB_OPEN_READ:
            raise RuntimeError(
                'Write operation attempted on read-only PersistenceManager'
            )
        self.svn_commit_db[svn_commit.revnum] = svn_commit
        if isinstance(svn_commit, SVNRevisionCommit):
            for cvs_rev in svn_commit.cvs_revs:
                self.cvs2svn_db[cvs_rev.id] = svn_commit.revnum

    def close(self):
        self.cvs2svn_db.close()
        self.cvs2svn_db = None
        self.svn_commit_db.close()
        self.svn_commit_db = None
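# Usage sketch of the two-pass lifecycle (svn_commit and cvs_rev_id are
# hypothetical values; the method calls are the ones defined above).

# Write side, while the conversion is generating commits:
pm = PersistenceManager(DB_OPEN_NEW)
pm.put_svn_commit(svn_commit)   # hypothetical SVNCommit instance
pm.close()

# Read side, in a later pass:
pm = PersistenceManager(DB_OPEN_READ)
commit = pm.get_svn_commit(42)           # -> SVNCommit, or None
revnum = pm.get_svn_revnum(cvs_rev_id)   # -> revnum, or SVN_INVALID_REVNUM
pm.close()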
class _NodeDatabase(object):
    """A database storing all of the directory nodes.

    The nodes are written in groups every time write_new_nodes() is
    called.  Each call writes a dictionary
    {node_id : [(cvs_path.id, node_id), ...]} to the database, where
    the keys are the node_ids of the new nodes.  When a node is read,
    its whole group is read and cached, under the assumption that the
    other nodes in the group are likely to be needed soon.  The cache
    is retained across revisions and cleared when _cache_max_size is
    exceeded.

    The dictionaries for nodes that have been read from the database
    during the current revision are cached by node_id in the _cache
    member variable.  The corresponding dictionaries are *not* copied
    when read.  To avoid cross-talk between distinct MirrorDirectory
    instances that have the same node_id, users of these dictionaries
    have to copy them before modification."""

    # How many entries should be allowed in the cache for each
    # CVSDirectory in the repository.  (This number is very roughly the
    # number of complete lines of development that can be stored in the
    # cache at one time.)
    CACHE_SIZE_MULTIPLIER = 5

    # But the cache will never be limited to less than this number:
    MIN_CACHE_LIMIT = 5000

    def __init__(self):
        self.cvs_path_db = Ctx()._cvs_path_db
        self.db = IndexedDatabase(
            artifact_manager.get_temp_file(config.MIRROR_NODES_STORE),
            artifact_manager.get_temp_file(config.MIRROR_NODES_INDEX_TABLE),
            DB_OPEN_NEW,
            serializer=MarshalSerializer(),
        )

        # A list of the maximum node_id stored by each call to
        # write_new_nodes():
        self._max_node_ids = [0]

        # A map {node_id : {cvs_path : node_id}}:
        self._cache = {}

        # The number of directories in the repository:
        num_dirs = len([
            cvs_path
            for cvs_path in self.cvs_path_db.itervalues()
            if isinstance(cvs_path, CVSDirectory)
        ])

        self._cache_max_size = max(
            int(self.CACHE_SIZE_MULTIPLIER * num_dirs),
            self.MIN_CACHE_LIMIT,
        )

    def _load(self, items):
        retval = {}
        for (id, value) in items:
            retval[self.cvs_path_db.get_path(id)] = value
        return retval

    def _dump(self, node):
        return [(cvs_path.id, value) for (cvs_path, value) in node.iteritems()]

    def _determine_index(self, id):
        """Return the index of the record holding the node with ID."""

        return bisect.bisect_left(self._max_node_ids, id)

    def __getitem__(self, id):
        try:
            items = self._cache[id]
        except KeyError:
            index = self._determine_index(id)
            for (node_id, items) in self.db[index].items():
                self._cache[node_id] = self._load(items)
            items = self._cache[id]
        return items

    def write_new_nodes(self, nodes):
        """Write NODES to the database.

        NODES is an iterable of writable CurrentMirrorDirectory
        instances."""

        if len(self._cache) > self._cache_max_size:
            # The size of the cache has exceeded the threshold.  Discard
            # the old cache values (but still store the new nodes into
            # the cache):
            logger.debug('Clearing node cache')
            self._cache.clear()

        data = {}
        max_node_id = 0
        for node in nodes:
            max_node_id = max(max_node_id, node.id)
            data[node.id] = self._dump(node._entries)
            self._cache[node.id] = node._entries

        self.db[len(self._max_node_ids)] = data
        if max_node_id == 0:
            # No new nodes were written; repeat the previous maximum so
            # that the group indexes stay aligned:
            self._max_node_ids.append(self._max_node_ids[-1])
        else:
            self._max_node_ids.append(max_node_id)

    def close(self):
        self._cache.clear()
        self.db.close()
        self.db = None
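# Self-contained sketch (toy data) of the group lookup performed by
# _determine_index(): _max_node_ids[i] holds the largest node_id written
# by the i-th call to write_new_nodes(), so bisect_left finds the record
# number of the group containing a given node_id.

import bisect

max_node_ids = [0, 17, 17, 40]  # group 2 wrote no nodes, so 17 repeats
for node_id in (3, 17, 18, 40):
    index = bisect.bisect_left(max_node_ids, node_id)
    print('node %d lives in db record %d' % (node_id, index))
# node 3 -> record 1, node 17 -> record 1,
# node 18 -> record 3, node 40 -> record 3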