def ensure_person(person):
    """Return the ``people`` row id for *person*, inserting a row if needed.

    The lookup is done by name only.  ``self`` is not a parameter: it is
    taken from the enclosing scope.

    person -- object exposing ``name`` and ``email`` attributes.
    """
    profiler_start("Ensuring person %s for repository %d",
                   (person.name, self.repo_id))
    printdbg("DBContentHandler: ensure_person %s <%s>",
             (person.name, person.email))

    db_cursor = self.cursor
    name = to_utf8(person.name)

    # The e-mail is optional; only convert it when one was given.
    email = person.email
    if email is not None:
        email = to_utf8(email).decode("utf-8")

    lookup_sql = statement("SELECT id from people where name = ?",
                           self.db.place_holder)
    lookup_args = (to_utf8(name).decode("utf-8"),)
    db_cursor.execute(lookup_sql, lookup_args)
    row = db_cursor.fetchone()

    if row:
        person_id = row[0]
    else:
        # Unknown person: store a new row and use the id of the new object.
        new_person = DBPerson(None, person)
        insert_sql = statement(DBPerson.__insert__, self.db.place_holder)
        insert_args = (new_person.id,
                       to_utf8(new_person.name).decode("utf-8"),
                       email)
        db_cursor.execute(insert_sql, insert_args)
        person_id = new_person.id

    profiler_stop("Ensuring person %s for repository %d",
                  (person.name, self.repo_id), True)
    return person_id
def __insert_many(self):
    """Write the buffered actions to the database and empty the buffer.

    Does nothing when both ``self.actions`` and ``self.commits`` are empty.
    Each action is inserted with its own ``execute`` call.  On a MySQL
    database a duplicate-entry error (code 1062) for an action is ignored;
    any other ``IntegrityError`` is propagated to the caller.

    NOTE(review): this block neither inserts ``self.commits`` nor commits
    the transaction -- confirm that this happens elsewhere.
    """
    if not self.actions and not self.commits:
        return

    cursor = self.cursor

    if self.actions:
        profiler_start("Inserting actions for repository %d",
                       (self.repo_id, ))

        # Loop-invariant work: the SQL text and the backend check are the
        # same for every action.
        insert_action = statement(DBAction.__insert__, self.db.place_holder)
        is_mysql = isinstance(self.db, MysqlDatabase)
        if is_mysql:
            # Imported lazily so other backends do not need MySQLdb.
            import MySQLdb

        for a in self.actions:
            action_tuple = (a.id, a.type, a.file_id, a.commit_id,
                            a.branch_id, a.current_file_path)

            if not is_mysql:
                cursor.execute(insert_action, action_tuple)
                continue

            try:
                cursor.execute(insert_action, action_tuple)
            except MySQLdb.IntegrityError as e:
                # 1062 == duplicate entry: the row already exists, skip it.
                # Anything else is a real integrity problem and must not
                # be hidden.
                if e.args[0] != 1062:
                    raise

        self.actions = []
        profiler_stop("Inserting actions for repository %d",
                      (self.repo_id, ))
def __insert_many(self):
    """Flush the pending actions into the database.

    Returns immediately when there are neither pending actions nor pending
    commits.  Actions are inserted one by one; when the database is a
    ``MysqlDatabase`` a duplicate-entry ``IntegrityError`` (code 1062) is
    skipped, while every other ``IntegrityError`` is re-raised.

    NOTE(review): ``self.commits`` is only consulted by the guard and no
    transaction commit is issued here -- verify against the callers.
    """
    if not self.actions and not self.commits:
        return

    cursor = self.cursor

    if self.actions:
        profiler_start("Inserting actions for repository %d",
                       (self.repo_id,))

        # Computed once: neither depends on the individual action.
        insert_sql = statement(DBAction.__insert__, self.db.place_holder)
        using_mysql = isinstance(self.db, MysqlDatabase)
        if using_mysql:
            # Only needed (and only required to be installed) for MySQL.
            import MySQLdb

        for a in self.actions:
            action_tuple = (a.id, a.type, a.file_id, a.commit_id,
                            a.branch_id, a.current_file_path)

            if using_mysql:
                try:
                    cursor.execute(insert_sql, action_tuple)
                except MySQLdb.IntegrityError as e:
                    # Only a duplicate entry (1062) may be ignored; other
                    # integrity errors used to be swallowed silently.
                    if e.args[0] != 1062:
                        raise
            else:
                cursor.execute(insert_sql, action_tuple)

        self.actions = []
        profiler_stop("Inserting actions for repository %d",
                      (self.repo_id,))
def end(self):
    """Flush pending inserts, save the caches and close the connection.

    The cursor and the connection are closed even when flushing or saving
    the caches raises; the exception is still propagated.  ``self.cnn`` is
    reset to ``None`` once the connection has been closed.
    """
    try:
        # flush pending inserts
        printdbg("DBContentHandler: flushing pending inserts")
        self.__insert_many()

        # Save the caches to disk
        profiler_start("Saving caches to disk")
        self.__save_caches_to_disk()
        profiler_stop("Saving caches to disk", delete=True)
    finally:
        # Release the database resources on every exit path; the nested
        # try makes sure the connection is closed even if closing the
        # cursor fails.
        try:
            self.cursor.close()
        finally:
            self.cnn.close()
            self.cnn = None
def ensure_tag(tag):
    """Return the ``tags`` row id for *tag*, inserting a row if needed.

    ``self`` is not a parameter: it is taken from the enclosing scope.
    """
    profiler_start("Ensuring tag %s for repository %d", (tag, self.repo_id))
    printdbg("DBContentHandler: ensure_tag %s", (tag,))

    db_cursor = self.cursor
    lookup_sql = statement("SELECT id from tags where name = ?",
                           self.db.place_holder)
    db_cursor.execute(lookup_sql, (tag,))
    row = db_cursor.fetchone()

    if row:
        tag_id = row[0]
    else:
        # No such tag yet: insert it and use the id of the new object.
        new_tag = DBTag(None, tag)
        insert_sql = statement(DBTag.__insert__, self.db.place_holder)
        db_cursor.execute(insert_sql, (new_tag.id, new_tag.name))
        tag_id = new_tag.id

    profiler_stop("Ensuring tag %s for repository %d",
                  (tag, self.repo_id), True)
    return tag_id
def ensure_branch(branch):
    """Return the ``branches`` row id for *branch*, inserting it if needed.

    ``self`` is not a parameter: it is taken from the enclosing scope.
    """
    profiler_start("Ensuring branch %s for repository %d",
                   (branch, self.repo_id))
    printdbg("DBContentHandler: ensure_branch %s", (branch,))

    db_cursor = self.cursor
    lookup_sql = statement("SELECT id from branches where name = ?",
                           self.db.place_holder)
    db_cursor.execute(lookup_sql, (branch,))
    row = db_cursor.fetchone()

    if row:
        branch_id = row[0]
    else:
        # Branch not stored yet: insert it and use the new object's id.
        new_branch = DBBranch(None, branch)
        insert_sql = statement(DBBranch.__insert__, self.db.place_holder)
        db_cursor.execute(insert_sql, (new_branch.id, new_branch.name))
        branch_id = new_branch.id

    profiler_stop("Ensuring branch %s for repository %d",
                  (branch, self.repo_id), True)
    return branch_id
def ensure_path(path, commit_id):
    """Make sure every component of *path* has a file node; return the leaf.

    *path* has the form ``<prefix>://<a>/<b>/...``.  Each cumulative path
    is looked up in ``self.file_cache``; components that are missing are
    created with ``__add_new_file_and_link`` (child of the previous
    component, ``-1`` for the first one), recorded with ``__add_file_path``
    and added to the cache.

    ``self`` is not a parameter: it is taken from the enclosing scope.

    path      -- prefixed path string containing ``://``.
    commit_id -- commit id passed on to the helpers that create entries.

    Returns a ``(node_id, parent_id)`` tuple for the last path component.
    """
    profiler_start("Ensuring path %s for repository %d",
                   (path, self.repo_id))
    printdbg("DBContentHandler: ensure_path %s", (path,))

    prefix, lpath = path.split("://", 1)
    prefix += "://"
    tokens = lpath.strip('/').split('/')

    # If the repo paths don't start with /, the extra slash added when
    # rebuilding each cumulative path has to be removed again.  This does
    # not depend on the loop variable, so it is decided once.
    strip_leading_slash = ":///" not in path

    parent = -1
    node_id = None
    for i, token in enumerate(tokens):
        rpath = prefix + '/' + '/'.join(tokens[:i + 1])
        if strip_leading_slash:
            rpath = rpath.replace(':///', '://')
        printdbg("DBContentHandler: rpath: %s", (rpath,))

        # Only a cache miss is expected here; a bare except would also
        # hide unrelated errors (including KeyboardInterrupt).
        try:
            node_id, parent_id = self.file_cache[rpath]
        except KeyError:
            pass
        else:
            parent = node_id
            continue

        # Rpath not in cache, add it
        node_id = self.__add_new_file_and_link(token, parent, commit_id)
        parent_id = parent
        parent = node_id

        # Also add to file_paths (without the numeric "<n>://" prefix)
        self.__add_file_path(commit_id, node_id,
                             re.sub(r'^\d+://', '', rpath))

        self.file_cache[rpath] = (node_id, parent_id)

    assert node_id is not None

    printdbg("DBContentHandler: path ensured %s = %d (%d)",
             (path, node_id, parent_id))
    profiler_stop("Ensuring path %s for repository %d",
                  (path, self.repo_id), True)

    return node_id, parent_id
def ensure_tag(tag):
    """Look up *tag* in the ``tags`` table, creating it when absent.

    Returns the id of the existing row, or the id of the newly inserted
    ``DBTag``.  ``self`` comes from the enclosing scope, not from the
    parameter list.
    """
    profiler_start("Ensuring tag %s for repository %d", (tag, self.repo_id))
    printdbg("DBContentHandler: ensure_tag %s", (tag,))

    cur = self.cursor
    select_sql = statement("SELECT id from tags where name = ?",
                           self.db.place_holder)
    cur.execute(select_sql, (tag,))
    found = cur.fetchone()

    if not found:
        # Tag is new: insert it and report the id carried by the object.
        db_tag = DBTag(None, tag)
        insert_sql = statement(DBTag.__insert__, self.db.place_holder)
        cur.execute(insert_sql, (db_tag.id, db_tag.name))
        tag_id = db_tag.id
    else:
        tag_id = found[0]

    profiler_stop("Ensuring tag %s for repository %d",
                  (tag, self.repo_id), True)
    return tag_id
def __insert_many(self):
    """Bulk-insert the buffered actions and commits, then commit.

    Nothing happens when both buffers are empty.  Each non-empty buffer is
    written with a single ``executemany`` call and then reset to an empty
    list; finally the transaction is committed on ``self.cnn``.
    """
    if not (self.actions or self.commits):
        return

    db_cursor = self.cursor

    if self.actions:
        action_rows = []
        for act in self.actions:
            action_rows.append((act.id, act.type, act.file_id,
                                act.commit_id, act.branch_id))

        profiler_start("Inserting actions for repository %d",
                       (self.repo_id,))
        action_sql = statement(DBAction.__insert__, self.db.place_holder)
        db_cursor.executemany(action_sql, action_rows)
        self.actions = []
        profiler_stop("Inserting actions for repository %d",
                      (self.repo_id,))

    if self.commits:
        commit_rows = []
        for log in self.commits:
            message = to_utf8(log.message).decode("utf-8")
            commit_rows.append((log.id, log.rev, log.committer, log.author,
                                log.date, message, log.composed_rev,
                                log.repository_id))

        profiler_start("Inserting commits for repository %d",
                       (self.repo_id,))
        commit_sql = statement(DBLog.__insert__, self.db.place_holder)
        db_cursor.executemany(commit_sql, commit_rows)
        self.commits = []
        profiler_stop("Inserting commits for repository %d",
                      (self.repo_id,))

    # Make the inserts above permanent.
    profiler_start("Committing inserts for repository %d", (self.repo_id,))
    self.cnn.commit()
    profiler_stop("Committing inserts for repository %d", (self.repo_id,))
def __insert_many(self):
    """Bulk-insert buffered actions and commits, link issues, then commit.

    Nothing happens when both ``self.actions`` and ``self.commits`` are
    empty.  Actions and commits are each written with one ``executemany``
    call.  Every commit message is then scanned (case-insensitively) for
    issue keys of the form ``OA-<digits>`` or ``CCIESC-<digits>`` and one
    ``DBIssueCommitLink`` row ``(commit id, issue key)`` is inserted per
    match.  Both buffers are emptied and the transaction is committed.
    """
    if not self.actions and not self.commits:
        return

    cursor = self.cursor

    if self.actions:
        actions = [(a.id, a.type, a.file_id, a.commit_id, a.branch_id)
                   for a in self.actions]
        profiler_start("Inserting actions for repository %d",
                       (self.repo_id,))
        cursor.executemany(statement(DBAction.__insert__,
                                     self.db.place_holder), actions)
        self.actions = []
        profiler_stop("Inserting actions for repository %d",
                      (self.repo_id,))

    if self.commits:
        commits = [(c.id, c.rev, c.committer, c.author, c.date, c.date_tz,
                    c.author_date, c.author_date_tz, c.message,
                    c.composed_rev, c.repository_id)
                   for c in self.commits]
        profiler_start("Inserting commits for repository %d",
                       (self.repo_id,))
        cursor.executemany(statement(DBLog.__insert__,
                                     self.db.place_holder), commits)

        # Raw string: "\d" must reach the regex engine untouched.
        issue_key = re.compile(r'((?:(?:OA)|(?:CCIESC))-\d+)', re.IGNORECASE)
        link_sql = statement(DBIssueCommitLink.__insert__,
                             self.db.place_holder)
        # Read id and message by attribute name rather than by position in
        # the row tuples built above.
        for c in self.commits:
            for bug in issue_key.findall(c.message):
                cursor.execute(link_sql, (c.id, bug))

        self.commits = []
        profiler_stop("Inserting commits for repository %d",
                      (self.repo_id,))

    profiler_start("Committing inserts for repository %d", (self.repo_id,))
    self.cnn.commit()
    profiler_stop("Committing inserts for repository %d", (self.repo_id,))
def __insert_many(self):
    """Flush buffered actions and commits and record issue/commit links.

    Returns immediately when there is nothing buffered.  Otherwise:

    * pending actions are written with one ``executemany`` call;
    * pending commits are written with one ``executemany`` call, and for
      each ``OA-<digits>`` / ``CCIESC-<digits>`` key found in a commit
      message (case-insensitive) a ``DBIssueCommitLink`` row is inserted;
    * the transaction is committed on ``self.cnn``.

    Both buffers are reset to empty lists once written.
    """
    if not self.actions and not self.commits:
        return

    cursor = self.cursor

    if self.actions:
        actions = [(a.id, a.type, a.file_id, a.commit_id, a.branch_id)
                   for a in self.actions]
        profiler_start("Inserting actions for repository %d",
                       (self.repo_id, ))
        cursor.executemany(
            statement(DBAction.__insert__, self.db.place_holder), actions)
        self.actions = []
        profiler_stop("Inserting actions for repository %d",
                      (self.repo_id, ))

    if self.commits:
        commits = [(c.id, c.rev, c.committer, c.author, c.date, c.date_tz,
                    c.author_date, c.author_date_tz, c.message,
                    c.composed_rev, c.repository_id)
                   for c in self.commits]
        profiler_start("Inserting commits for repository %d",
                       (self.repo_id, ))
        cursor.executemany(
            statement(DBLog.__insert__, self.db.place_holder), commits)

        # Raw string so the "\d" escape is interpreted by the regex engine
        # and not by the string literal.
        p = re.compile(r'((?:(?:OA)|(?:CCIESC))-\d+)', re.IGNORECASE)
        # The statement text is the same for every link, build it once.
        insert_link = statement(DBIssueCommitLink.__insert__,
                                self.db.place_holder)
        # Use the commit objects directly instead of indexing the row
        # tuples by position (0 = id, 8 = message).
        for c in self.commits:
            for bug in p.findall(c.message):
                issue_commit_link = (c.id, bug)
                cursor.execute(insert_link, issue_commit_link)

        self.commits = []
        profiler_stop("Inserting commits for repository %d",
                      (self.repo_id, ))

    profiler_start("Committing inserts for repository %d", (self.repo_id, ))
    self.cnn.commit()
    profiler_stop("Committing inserts for repository %d", (self.repo_id, ))
def commit(self, commit):
    """Buffer *commit* and its file actions for insertion into the database.

    A commit whose revision is already in ``self.revision_cache`` is
    ignored.  Otherwise a ``DBLog`` is created and appended to
    ``self.commits``, one ``DBAction`` per file action is appended to
    ``self.actions``, and tag/revision rows are inserted right away.  When
    at least ``self.MAX_ACTIONS`` actions are buffered they are flushed with
    ``__insert_many``.

    commit -- object exposing ``revision``, ``committer``, ``author``,
              ``branch``, ``actions`` and ``tags``.

    Raises ValueError when an action has a type other than
    A, M, D, V, C or R.
    """
    if commit.revision in self.revision_cache:
        return

    profiler_start("New commit %s for repository %d",
                   (commit.revision, self.repo_id))

    log = DBLog(None, commit)
    log.repository_id = self.repo_id
    self.revision_cache[commit.revision] = log.id

    log.committer = self.__get_person(commit.committer)

    # Avoid a second lookup when author and committer are the same person.
    if commit.author == commit.committer:
        log.author = log.committer
    elif commit.author is not None:
        log.author = self.__get_person(commit.author)

    self.commits.append(log)

    printdbg("DBContentHandler: commit: %d rev: %s", (log.id, log.rev))

    # TODO: sort actions? R, A, D, M, V, C
    for action in commit.actions:
        printdbg("DBContentHandler: Action: %s", (action.type,))
        dbaction = DBAction(None, action.type)
        dbaction.commit_id = log.id

        # Fall back to the action's own branch when the commit has none.
        branch = commit.branch or action.branch_f1
        branch_id = self.__get_branch(branch)
        dbaction.branch_id = branch_id

        # Paths are namespaced by branch id: "<branch_id>://<path>".
        prefix = "%d://" % (branch_id)
        path = prefix + action.f1

        if action.type == 'A':
            # A file has been added
            file_id = self.__action_add(path, prefix, log)
        elif action.type == 'M':
            # A file has been modified
            file_id = self.__get_file_for_path(path, log.id)[0]
        elif action.type == 'D':
            # A file has been deleted
            file_id = self.__action_delete(path, log)
        elif action.type == 'V':
            # A file has been renamed
            file_id = self.__action_rename(path, prefix, log, action,
                                           dbaction)
        elif action.type == 'C':
            # A file has been copied
            file_id = self.__action_copy(path, prefix, log, action,
                                         dbaction)
        elif action.type == 'R':
            # A file has been replaced
            file_id = self.__action_replace(path, prefix, log, action,
                                            dbaction)
            if file_id is None:
                continue
        else:
            # The previous ``assert "<message>"`` was always true, so an
            # unknown type silently reused the file_id of the previous
            # action (or hit an unbound local).  Fail explicitly instead.
            raise ValueError("Unknown action type %s" % (action.type,))

        dbaction.file_id = file_id
        self.actions.append(dbaction)

    # Tags
    if commit.tags is not None:
        tag_revs = []
        for tag in commit.tags:
            tag_id = self.__get_tag(tag)
            db_tagrev = DBTagRev(None)
            tag_revs.append((db_tagrev.id, tag_id, log.id))

        self.cursor.executemany(statement(DBTagRev.__insert__,
                                          self.db.place_holder), tag_revs)

    if len(self.actions) >= self.MAX_ACTIONS:
        printdbg("DBContentHandler: %d actions inserting",
                 (len(self.actions),))
        self.__insert_many()

    profiler_stop("New commit %s for repository %d",
                  (commit.revision, self.repo_id), True)
class DBContentHandler(ContentHandler):
    """Content handler that buffers commits/actions and writes them to a DB.

    The handler keeps several in-memory caches that are persisted to a
    per-repository cache file, and checks on start-up that this file is
    consistent with the last commit stored in the database.
    """

    # NOTE(review): presumably the number of buffered actions that triggers
    # a flush; it is not referenced by the methods below.
    MAX_ACTIONS = 100

    def __init__(self, db):
        """Store *db*; the connection itself is opened later by begin()."""
        ContentHandler.__init__(self)

        self.db = db
        self.cnn = None
        self.cursor = None

        self.__init_caches()

    def __init_caches(self):
        """Reset every in-memory cache to an empty dict."""
        self.file_cache = {}
        self.moves_cache = {}
        self.deletes_cache = {}
        self.revision_cache = {}
        self.branch_cache = {}
        self.tags_cache = {}
        self.people_cache = {}

    def __save_caches_to_disk(self):
        """Dump all caches, as one list, into ``self.cache_file``."""
        printdbg("DBContentHandler: Saving caches to disk (%s)",
                 (self.cache_file, ))
        cache = [
            self.file_cache, self.moves_cache, self.deletes_cache,
            self.revision_cache, self.branch_cache, self.tags_cache,
            self.people_cache
        ]
        # Protocol -1 produces binary data, so the file must be opened in
        # binary mode; ``with`` closes it even if dump() fails.
        with open(self.cache_file, 'wb') as f:
            dump(cache, f, -1)

    def __load_caches_from_disk(self):
        """Replace all caches with the contents of ``self.cache_file``."""
        printdbg("DBContentHandler: Loading caches from disk (%s)",
                 (self.cache_file, ))
        # NOTE(review): ``load`` is presumably pickle's; unpickling is only
        # safe while the cache file comes from a trusted location.
        with open(self.cache_file, 'rb') as f:
            (self.file_cache, self.moves_cache, self.deletes_cache,
             self.revision_cache, self.branch_cache, self.tags_cache,
             self.people_cache) = load(f)

    def __del__(self):
        """Close the connection if it is still open."""
        if self.cnn is not None:
            self.cnn.close()

    def begin(self, order=None):
        """Open the connection and cursor and start with empty buffers.

        order -- accepted but not used by this method.
        """
        self.cnn = self.db.connect()

        self.cursor = self.cnn.cursor()

        self.commits = []
        self.actions = []

    def repository(self, uri):
        """Select the repository *uri* and validate its cache file.

        Sets ``self.repo_id`` and ``self.cache_file``.  If a cache file
        exists it is loaded and must contain the last revision stored in
        the database with the same commit id; if the database holds no
        commit for the repository, the cache file is removed instead.

        Raises CacheFileMismatch when the cache file and the database
        disagree, or when the database has commits but no cache file
        exists.
        """
        cursor = self.cursor
        cursor.execute(
            statement("SELECT id from repositories where uri = ?",
                      self.db.place_holder), (uri, ))
        # NOTE(review): fetchone() returns None for an unknown uri, which
        # would raise TypeError here -- confirm callers register the
        # repository beforehand.
        self.repo_id = cursor.fetchone()[0]

        last_rev = last_commit = None
        query = ("SELECT rev, id from scmlog where id = "
                 "(select max(id) from scmlog where repository_id = ?)")
        cursor.execute(statement(query, self.db.place_holder),
                       (self.repo_id, ))
        rs = cursor.fetchone()
        if rs is not None:
            last_rev, last_commit = rs

        filename = uri.replace('/', '_')
        self.cache_file = os.path.join(cvsanaly_cache_dir(), filename)

        # if there's a previous cache file, just use it
        if os.path.isfile(self.cache_file):
            self.__load_caches_from_disk()

            if last_rev is not None:
                try:
                    commit_id = self.revision_cache[last_rev]
                except KeyError:
                    msg = ("Cache file %s is not up to date or it's "
                           "corrupt: "
                           "Revision %s was not found in the cache file. "
                           "It's not possible to continue, the cache "
                           "file should be removed and the database "
                           "cleaned up"
                           % (self.cache_file, last_rev))
                    raise CacheFileMismatch(msg)
                if commit_id != last_commit:
                    # Cache and db don't match: refuse to continue
                    msg = ("Cache file %s is not up to date or it's "
                           "corrupt: "
                           "Commit id mismatch for revision %s "
                           "(File Cache:%d, Database: %d). "
                           "It's not possible to continue, the cache "
                           "file should be removed and the database "
                           "cleaned up"
                           % (self.cache_file, last_rev, commit_id,
                              last_commit))
                    raise CacheFileMismatch(msg)
            else:
                # Database looks empty (or corrupt) and we have
                # a cache file. We can just remove it and continue
                # normally
                self.__init_caches()
                os.remove(self.cache_file)
                printout("Database looks empty, removing cache file %s",
                         (self.cache_file, ))
        elif last_rev is not None:
            # There are data in the database,
            # but we don't have a cache file!!!
            msg = ("Cache file %s is not up to date or it's corrupt: "
                   "Cache file cannot be found. "
                   "It's not possible to continue, the database "
                   "should be cleaned up"
                   % (self.cache_file, ))
            raise CacheFileMismatch(msg)

    def __insert_many(self):
        """Write buffered actions and commits, then commit the transaction.

        Nothing happens when both buffers are empty.  Actions are inserted
        one at a time; on MySQL a duplicate-entry error (code 1062) for an
        action is ignored, any other ``IntegrityError`` is propagated.
        Commits are inserted with a single ``executemany`` call.
        """
        if not self.actions and not self.commits:
            return

        cursor = self.cursor

        if self.actions:
            profiler_start("Inserting actions for repository %d",
                           (self.repo_id, ))

            # Loop-invariant: same SQL text and backend for every action.
            insert_action = statement(DBAction.__insert__,
                                      self.db.place_holder)
            is_mysql = isinstance(self.db, MysqlDatabase)
            if is_mysql:
                # Imported lazily so other backends do not need MySQLdb.
                import MySQLdb

            for a in self.actions:
                action_tuple = (a.id, a.type, a.file_id, a.commit_id,
                                a.branch_id, a.current_file_path)

                if is_mysql:
                    try:
                        cursor.execute(insert_action, action_tuple)
                    except MySQLdb.IntegrityError as e:
                        # 1062 == duplicate entry: skip it.  Any other
                        # integrity error must not be hidden.
                        if e.args[0] != 1062:
                            raise
                else:
                    cursor.execute(insert_action, action_tuple)

            self.actions = []
            profiler_stop("Inserting actions for repository %d",
                          (self.repo_id, ))

        if self.commits:
            commits = [(c.id, c.rev, c.committer, c.author, c.commit_date,
                        c.author_date,
                        to_utf8(c.message).decode("utf-8"),
                        c.composed_rev, c.repository_id)
                       for c in self.commits]
            profiler_start("Inserting commits for repository %d",
                           (self.repo_id, ))
            cursor.executemany(
                statement(DBLog.__insert__, self.db.place_holder), commits)
            self.commits = []
            profiler_stop("Inserting commits for repository %d",
                          (self.repo_id, ))

        profiler_start("Committing inserts for repository %d",
                       (self.repo_id, ))
        self.cnn.commit()
        profiler_stop("Committing inserts for repository %d",
                      (self.repo_id, ))