def get_file_id(self, file_path, commit_id):
    """Return the file_id for file_path as of commit_id, or None.

    Picks the most recent file_paths row whose commit_id is at or
    before the given commit.
    """
    if config.debug:
        profiler_start("Getting file id for file_path %s and commit_id %d",
                       (file_path, commit_id))

    db = self.__dict__['db']
    cnn = db.connect()
    cursor = cnn.cursor()

    query = """SELECT file_id from file_paths
               WHERE file_path = ? AND commit_id <= ?
               ORDER BY commit_id DESC LIMIT 1"""
    cursor.execute(statement(query, db.place_holder), (file_path, commit_id))

    # fetchone() returns None when nothing matched.  The former bare
    # "except:" silently swallowed *any* error (including programming
    # errors); test the row explicitly instead.
    row = cursor.fetchone()
    if row is not None:
        file_id = row[0]
    else:
        file_id = None

    cursor.close()
    cnn.close()

    if config.debug:
        profiler_stop("Getting file id for file_path %s and commit_id %d",
                      (file_path, commit_id), True)
    return file_id
def get_path_from_database(self, file_id, commit_id):
    """Return the last valid path for file_id at commit_id, or None.

    The path may have been removed in a later commit.
    """
    # The profiler start/stop keys were written with backslash string
    # continuations whose embedded whitespace differed between start
    # and stop, so the timer never matched up; use one identical
    # single-line key for both.
    if config.debug:
        profiler_start("Getting full file path for file_id %d and commit_id %d",
                       (file_id, commit_id))

    db = self.__dict__['db']
    cnn = db.connect()
    cursor = cnn.cursor()

    query = """SELECT file_path from file_paths
               WHERE file_id=? AND commit_id <= ?
               ORDER BY commit_id DESC LIMIT 1"""
    cursor.execute(statement(query, db.place_holder), (file_id, commit_id))

    # Explicit None check instead of the former bare "except:" that
    # hid every kind of failure.
    row = cursor.fetchone()
    if row is not None:
        file_path = row[0]
    else:
        file_path = None

    cursor.close()
    cnn.close()

    printdbg("get_path_from_database: Path for file_id %d at commit_id %d: %s",
             (file_id, commit_id, file_path))

    if config.debug:
        profiler_stop("Getting full file path for file_id %d and commit_id %d",
                      (file_id, commit_id), True)
    return file_path
def get_path_from_database(self, file_id, commit_id):
    """Return the last valid path for file_id at commit_id, or None.

    Variant that reads the denormalized current_file_path column of
    the actions table.  The path may have been removed in a later
    commit.
    """
    # Use identical single-line profiler keys: the original start/stop
    # strings were built with backslash continuations that embedded
    # different whitespace, so they never matched.
    if config.debug:
        profiler_start("Getting full file path for file_id %d and commit_id %d",
                       (file_id, commit_id))

    db = self.__dict__['db']
    cnn = db.connect()
    cursor = cnn.cursor()

    query = """SELECT current_file_path from actions
               WHERE file_id=? AND commit_id <= ?
               ORDER BY commit_id DESC LIMIT 1"""
    cursor.execute(statement(query, db.place_holder), (file_id, commit_id))

    # Explicit None check instead of a bare "except:" that swallowed
    # every error.
    row = cursor.fetchone()
    if row is not None:
        file_path = row[0]
    else:
        file_path = None

    cursor.close()
    cnn.close()

    printdbg("get_path_from_database: Path for file_id %d at commit_id %d: %s",
             (file_id, commit_id, file_path))

    if config.debug:
        profiler_stop("Getting full file path for file_id %d and commit_id %d",
                      (file_id, commit_id), True)
    return file_path
def get_file_id(self, file_path, commit_id):
    """Return the file_id recorded in actions for file_path at exactly
    commit_id, or None when no matching action exists.
    """
    if config.debug:
        profiler_start("Getting file id for file_path %s and commit_id %d",
                       (file_path, commit_id))

    db = self.__dict__['db']
    cnn = db.connect()
    cursor = cnn.cursor()

    # NOTE(review): "binary" is the MySQL BINARY operator (forces a
    # case-sensitive comparison); this query is not portable to other
    # backends — confirm against the supported database list.
    query = """SELECT file_id from actions
               WHERE binary current_file_path = ? AND commit_id = ?
               ORDER BY commit_id DESC LIMIT 1"""
    cursor.execute(statement(query, db.place_holder), (file_path, commit_id))

    # Explicit None check replaces the former bare "except:" which hid
    # real database errors as well as the no-row case.
    row = cursor.fetchone()
    if row is not None:
        file_id = row[0]
    else:
        file_id = None

    cursor.close()
    cnn.close()

    if config.debug:
        profiler_stop("Getting file id for file_path %s and commit_id %d",
                      (file_path, commit_id), True)
    return file_id
def get_path(self, file_id, commit_id, repo_id):
    """Return the full path of file_id at commit_id using the cached
    adjacency matrix.

    update_all()/update_for_revision() must have populated the matrix
    first.  repo_id is accepted for interface symmetry but is unused
    here.
    """
    profiler_start("Getting path for file %d at commit %d",
                   (file_id, commit_id))
    adj = self.__dict__['adj']
    # Fixed typo in the assertion message ("no updated" -> "not updated").
    assert adj is not None, "Matrix not updated"
    path = self.__build_path(file_id, adj)
    profiler_stop("Getting path for file %d at commit %d",
                  (file_id, commit_id), True)
    return path
def update_all(self, repo_id):
    """Build and cache the adjacency matrix of every revision of repo_id.

    Pros: after this runs, file paths in different revisions can be
    accessed randomly, i.e. get_path can be called with any revision
    in any order.
    Cons: storing an adjacency matrix per revision consumes
    significant memory.

    If the config has low_memory set to true, shelve will be used
    instead, to write the cache out to disk.
    """
    profiler_start("Update all file paths")
    if Config().low_memory:
        # Unique-per-run on-disk cache file, keyed on the current time.
        self.shelve_file_name = str(time()) + "-shelve.db"
        # If there is an old file, shelf will complain viciously
        if os.path.exists(self.shelve_file_name):
            os.remove(self.shelve_file_name)
        self.__dict__['cached_adj'] = shelve.open(self.shelve_file_name,
                                                  writeback=False)

    db = self.__dict__['db']
    cnn = db.connect()
    cursor = cnn.cursor()
    # Every commit of the repository that touched at least one file,
    # replayed in date order.
    query = """select distinct(s.id) from scmlog s, actions a
               where s.id = a.commit_id and repository_id=?
               order by s.date"""
    cursor.execute(statement(query, db.place_holder), (repo_id,))
    old_id = -1
    all_commits = [i[0] for i in cursor.fetchall()]
    for id in all_commits:
        if old_id != id:
            adj = self.__dict__['cached_adj'].get(str(id))
            if adj is None:
                # Not cached yet: advance the incremental matrix to this
                # revision and store a deep copy so later revisions
                # cannot mutate the cached snapshot.
                self.update_for_revision(cursor, id, repo_id)
                self.__dict__['cached_adj'][str(id)] = \
                    deepcopy(self.__dict__['adj'])
            old_id = id
    cursor.close()
    cnn.close()
    profiler_stop("Update all file paths", delete=True)
def update_all(self, repo_id):
    """Build and cache the adjacency matrix of every revision of repo_id.

    Pros: after this runs, file paths in different revisions can be
    accessed randomly, i.e. get_path can be called with any revision
    in any order.
    Cons: storing an adjacency matrix per revision consumes
    significant memory.

    If the config has low_memory set to true, shelve will be used
    instead, to write the cache out to disk.
    """
    profiler_start("Update all file paths")
    if Config().low_memory:
        # Unique-per-run on-disk cache file, keyed on the current time.
        self.shelve_file_name = str(time()) + "-shelve.db"
        # If there is an old file, shelf will complain viciously
        if os.path.exists(self.shelve_file_name):
            os.remove(self.shelve_file_name)
        self.__dict__['cached_adj'] = shelve.open(self.shelve_file_name,
                                                  writeback=False)

    db = self.__dict__['db']
    cnn = db.connect()
    cursor = cnn.cursor()
    # Every commit of the repository that touched at least one file,
    # replayed in commit-date order (this variant orders by
    # s.commit_date rather than s.date).
    query = """select distinct(s.id) from scmlog s, actions a
               where s.id = a.commit_id and repository_id=?
               order by s.commit_date"""
    cursor.execute(statement(query, db.place_holder), (repo_id,))
    old_id = -1
    all_commits = [i[0] for i in cursor.fetchall()]
    for id in all_commits:
        if old_id != id:
            adj = self.__dict__['cached_adj'].get(str(id))
            if adj is None:
                # Not cached yet: advance the incremental matrix and
                # store a deep copy so the cached snapshot is immutable.
                self.update_for_revision(cursor, id, repo_id)
                self.__dict__['cached_adj'][str(id)] = \
                    deepcopy(self.__dict__['adj'])
            old_id = id
    cursor.close()
    cnn.close()
    profiler_stop("Update all file paths", delete=True)
def __build_path(self, file_id, adj):
    """Walk the child->parent adjacency map from file_id up to the
    root marker (-1) and return the absolute path, or None when
    file_id has no entry at all.
    """
    if file_id not in adj.adj:
        return None

    profiler_start("Building path for file %d", (file_id,))
    tokens = []
    id = file_id
    while id is not None and id != -1:
        tokens.insert(0, adj.files[id])
        # Use get() instead of [] so that a missing parent link simply
        # ends the walk; the former adj.adj[id] raised KeyError when
        # the matrix was incomplete.
        id = adj.adj.get(id)
    profiler_stop("Building path for file %d", (file_id,), True)
    return "/" + "/".join(tokens)
def __build_path(self, file_id, adj):
    """Assemble the absolute path of file_id by following parent links
    in the adjacency matrix until the root marker (-1) is reached.

    Returns None when file_id is not present in the matrix at all.
    A missing parent link is treated as the root.
    """
    if file_id not in adj.adj:
        return None

    profiler_start("Building path for file %d", (file_id,))
    segments = []
    node = file_id
    while node != -1:
        segments.append(adj.files[node])
        # get() with -1 default: an absent parent terminates the walk.
        node = adj.adj.get(node, -1)
    profiler_stop("Building path for file %d", (file_id,), True)

    segments.reverse()
    return "/" + "/".join(segments)
class BlameJob (Job):
    """Job that blames one file revision and tallies, per author, how
    many lines of the file they last touched."""

    class BlameContentHandler (OutputDevice):
        # Output device fed by the blame parser; accumulates a
        # per-author line count.
        def __init__ (self):
            self.authors = {}

        def start_file (self, filename):
            pass

        def line (self, line):
            # One more line attributed to this line's author.
            self.authors.setdefault (line.author, 0)
            self.authors[line.author] += 1

        def end_file (self):
            pass

        def get_authors (self):
            return self.authors

    def __init__ (self, file_id, commit_id, path, rev):
        Job.__init__(self)
        self.file_id = file_id
        self.commit_id = commit_id
        self.path = path
        self.rev = rev
        self.authors = None

    def run (self, repo, repo_uri):
        """Run blame for self.path at self.rev against repo and collect
        the per-author results."""
        profiler_start("Running BlameJob for %s@%s", (self.path,self.rev))
        def blame_line (line, p):
            p.feed (line)

        repo_type = repo.get_type ()
        if repo_type == 'cvs':
            # CVS paths contain the module stuff
            uri = repo.get_uri_for_path (repo_uri)
            module = uri[len (repo.get_uri ()):].strip ('/')
            if module != '.':
                path = self.path[len (module):].strip ('/')
            else:
                path = self.path.strip ('/')
        else:
            path = self.path.strip ('/')

        # NOTE(review): filename is computed but never used below.
        filename = os.path.basename (self.path)
        p = create_parser (repo.get_type (), self.path)
        out = self.get_content_handler()
        p.set_output_device (out)
        wid = repo.add_watch (BLAME, blame_line, p)
        try:
            repo.blame (os.path.join (repo_uri, path), self.rev)
            self.collect_results(out)
        except RepositoryCommandError, e:
            # Mark the job failed but don't abort the whole extension.
            self.failed = True
            printerr ("Command %s returned %d (%s)",
                      (e.cmd, e.returncode, e.error))
        p.end ()
        repo.remove_watch(BLAME, wid)
        profiler_stop("Running BlameJob for %s@%s", (self.path,self.rev),
                      delete=True)
def __build_path(self, file_id, adj):
    """Return the absolute path of file_id, or None when it has no
    entry in the adjacency matrix.

    The walk up the parent chain stops at the root marker (-1) or when
    a parent link is simply absent.
    """
    if file_id not in adj.adj:
        return None

    profiler_start("Building path for file %d", (file_id,))
    parts = []
    current = file_id
    while current is not None and current != -1:
        parts.append(adj.files[current])
        # get() returns None for a missing parent instead of raising
        # KeyError, which cleanly terminates the loop.
        current = adj.adj.get(current)
    profiler_stop("Building path for file %d", (file_id,), True)

    return "/" + "/".join(reversed(parts))
def update_all(self, repo_id):
    """Replay every commit of repo_id (ordered by id) through
    update_for_revision so the adjacency matrix reflects the last
    revision of the repository."""
    profiler_start("Update all file paths")
    db = self.__dict__['db']
    connection = db.connect ()
    cur = connection.cursor ()
    query = """select distinct(s.id) from scmlog s, actions a
               where s.id = a.commit_id and repository_id=?
               order by s.id"""
    cur.execute (statement (query, db.place_holder), (repo_id,))

    commit_ids = [row[0] for row in cur.fetchall ()]
    previous = -1
    for commit_id in commit_ids:
        # Guard against duplicate ids in the result set.
        if commit_id != previous:
            self.update_for_revision (cur, commit_id, repo_id)
            previous = commit_id

    cur.close()
    connection.close()
    profiler_stop("Update all file paths", delete=True)
def get_patches(self, repo, repo_uri, repo_id, db, cursor):
    """Generator yielding (commit_id, file_id, utf8_patch, rev) for
    every non-NULL patch stored for repository repo_id."""
    profiler_start("Hunks: fetch all patches")
    icursor = ICursor(cursor, self.INTERVAL_SIZE)
    # Get the patches from this repository
    query = """select p.commit_id, p.file_id, p.patch, s.rev
               from patches p, scmlog s
               where p.commit_id = s.id and
               s.repository_id = ? and
               p.patch is not NULL"""
    icursor.execute(statement(query, db.place_holder), (repo_id,))
    profiler_stop("Hunks: fetch all patches", delete=True)

    batch = icursor.fetchmany()
    while batch:
        for commit_id, file_id, patch_text, rev in batch:
            yield (commit_id, file_id, to_utf8(patch_text), rev)
        batch = icursor.fetchmany()
def get_patches(self, repo, repo_uri, repo_id, db, cursor):
    """Iterate over all stored, non-NULL patches of repository repo_id,
    yielding (commit_id, file_id, utf8_patch, rev) tuples."""
    profiler_start("Hunks: fetch all patches")
    interval_cursor = ICursor(cursor, self.INTERVAL_SIZE)
    # Get the patches from this repository
    query = """select p.commit_id, p.file_id, p.patch, s.rev
               from patches p, scmlog s
               where p.commit_id = s.id and
               s.repository_id = ? and
               p.patch is not NULL"""
    interval_cursor.execute(statement(query, db.place_holder), (repo_id, ))
    profiler_stop("Hunks: fetch all patches", delete=True)

    rows = interval_cursor.fetchmany()
    while rows:
        for row in rows:
            commit_id, file_id, patch_content, rev = row
            yield (commit_id, file_id, to_utf8(patch_content), rev)
        rows = interval_cursor.fetchmany()
def update_all(self, repo_id):
    """Walk every commit of repo_id in ascending id order and feed it
    to update_for_revision, bringing the adjacency matrix up to the
    repository head."""
    profiler_start("Update all file paths")
    db = self.__dict__['db']
    cnn = db.connect()
    cursor = cnn.cursor()
    query = """select distinct(s.id) from scmlog s, actions a
               where s.id = a.commit_id and repository_id=?
               order by s.id"""
    cursor.execute(statement(query, db.place_holder), (repo_id, ))

    last_seen = -1
    for (commit_id,) in cursor.fetchall():
        # Skip consecutive duplicates, as the original loop did.
        if commit_id == last_seen:
            continue
        self.update_for_revision(cursor, commit_id, repo_id)
        last_seen = commit_id

    cursor.close()
    cnn.close()
    profiler_stop("Update all file paths", delete=True)
def get_commit_data(self, patch_content):
    """Parse a unified diff and return a list of CommitData objects,
    one per contiguous added/deleted/changed region of each hunk.

    Line numbers are tracked separately for the old and new side of
    the diff; a region is flushed whenever a context line (or the end
    of the hunk) terminates a run of insert/remove lines.
    """
    profiler_start("get_commit_data")
    # Re-add newlines: parse_patches expects terminated lines, and
    # empty lines are dropped.
    lines = [l + "\n" for l in patch_content.splitlines() if l]
    hunks = []

    for patch in [p for p in parse_patches(lines, allow_dirty=True,
                  allow_continue=True) if isinstance(p, Patch)]:
        # This method matches that of parseLine in UnifiedDiffParser.java
        # It's not necessarily intuitive, but this algorithm is much harder
        # than it looks, I spent hours trying to get a simpler solution.
        # It does, however, seem to work, which is pretty amazing when
        # you think about how difficult it is for long enough.
        # The trick that this method does is that each *part* of a hunk
        # ie. added, deleted, changed are treated as *new entities*.
        # The EntityDelta table does not store just diffs, it stores
        # each part of a diff.
        # I will need to copy the behavior of how Sep inserts a CommitData
        # into the database to ensure things match
        for hunk in patch.hunks:
            # Positions are 1-based in the hunk header; start one short
            # so the first Insert/Remove line increments onto it.
            old_start_line = hunk.orig_pos - 1
            new_start_line = hunk.mod_pos - 1
            old_end_line = 0
            new_end_line = 0
            added = False
            deleted = False
            in_change = False
            for line in hunk.lines:
                if isinstance(line, RemoveLine):
                    if not in_change or not deleted:
                        # First removed line of a new region.
                        in_change = True
                        old_start_line += 1
                        old_end_line = old_start_line
                    else:
                        old_end_line += 1
                    deleted = True
                elif isinstance(line, InsertLine):
                    if not in_change or not added:
                        # First inserted line of a new region.
                        in_change = True
                        new_start_line += 1
                        new_end_line = new_start_line
                    else:
                        new_end_line += 1
                    added = True
                elif isinstance(line, ContextLine):
                    if in_change:
                        # A context line closes the current region:
                        # emit one CommitData describing it.
                        in_change = False
                        printdbg("Patch new name: " + patch.newname)
                        file_name = re.split('\s+', patch.newname)[0]
                        if file_name == "/dev/null":
                            file_name = re.split('\s+', patch.oldname)[0]
                        cd = CommitData(file_name)
                        if deleted:
                            cd.old_start_line = old_start_line
                            cd.old_end_line = old_end_line
                            old_start_line = old_end_line
                        if added:
                            cd.new_start_line = new_start_line
                            cd.new_end_line = new_end_line
                            new_start_line = new_end_line
                        hunks.append(cd)
                        added = deleted = False
                    # Context lines advance both counters.
                    old_start_line += 1
                    new_start_line += 1
            # The diff ended without a new context line
            if in_change:
                cd = CommitData(re.split('\s+', patch.newname)[0])
                if deleted:
                    cd.old_start_line = old_start_line
                    cd.old_end_line = old_end_line
                if added:
                    cd.new_start_line = new_start_line
                    cd.new_end_line = new_end_line
                hunks.append(cd)
    profiler_stop("get_commit_data")
    return hunks
                execute_statement(statement(insert, db.place_holder),
                                  (file_id, commit_id,
                                   hunk.old_start_line,
                                   hunk.old_end_line,
                                   hunk.new_start_line,
                                   hunk.new_end_line),
                                  write_cursor, db,
                                  "Couldn't insert hunk, dup record?",
                                  exception=ExtensionRunError)
                # Commit after every hunk insert so progress survives
                # a mid-run failure.  NOTE(review): per-row commits are
                # slow — confirm this is intentional.
                connection.commit()
                progress.finished_one()
        read_cursor.close()
        connection.commit()
        connection.close()
        progress.done()

        # This turns off the profiler and deletes its timings
        profiler_stop("Running hunks extension", delete=True)

    def backout(self, repo, uri, db):
        """Remove every hunk recorded for this repository."""
        update_statement = """delete from hunks where commit_id in
                              (select s.id from scmlog s
                               where s.repository_id = ?)"""
        self._do_backout(repo, uri, db, update_statement)

register_extension("Hunks", Hunks)
                                  p.patch is not NULL""",
                       db.place_holder), (repo_id, ))
        nr_records = cursor.fetchone()[0]
        progress = Progress("[Extension PatchesLOC]", nr_records)
        patches = self.get_patches(repo, path or repo.get_uri(), repo_id,
                                   db, cursor)
        for commit_id, file_id, patch_content, rev in patches:
            # Count added/removed lines of the patch and store them.
            (added, removed) = self.count_lines(patch_content)
            insert = """insert into patch_lines(file_id, commit_id,
                        added, removed)
                        values(?,?,?,?)"""
            execute_statement(statement(insert, db.place_holder),
                              (file_id, commit_id, added, removed),
                              cursor, db,
                              "Couldn't insert patch, dup record?",
                              exception=ExtensionRunError)
            # Commit per patch so partial progress is preserved.
            connection.commit()
            progress.finished_one()

        cursor.close()
        connection.commit()
        connection.close()
        progress.done()
        profiler_stop("Running PatchLOC extension", delete=True)

register_extension("PatchLOC", PatchLOC)
                        where id = ?"""
            # Flag the commit as a bug fix when its message matches the
            # configured bug-fix patterns.
            if self.fixes_bug(commit_message):
                is_bug_fix = 1
            else:
                is_bug_fix = 0
            execute_statement(statement(update, db.place_holder),
                              (is_bug_fix, row_id),
                              write_cursor, db,
                              "Couldn't update scmlog",
                              exception=ExtensionRunError)
        read_cursor.close()
        connection.commit()
        connection.close()

        # This turns off the profiler and deletes its timings
        profiler_stop("Running BugFixMessage extension", delete=True)

    def backout(self, repo, uri, db):
        """Reset the is_bug_fix flag for every commit of the repository."""
        backout_statement = """update scmlog
                               set is_bug_fix = NULL
                               where repository_id = ?"""
        self._do_backout(repo, uri, db, backout_statement)

register_extension("BugFixMessage", BugFixMessage)
def update_for_revision(self, cursor, commit_id, repo_id):
    """Advance the cached adjacency matrix to commit_id.

    Incremental: only file renames and new parent links between the
    previously processed revision and commit_id are applied.  The
    files table of the repository is fetched once and cached.
    """
    db = self.__dict__['db']

    # Already at this revision: nothing to do.
    if commit_id == self.__dict__['rev']:
        return

    prev_commit_id = self.__dict__['rev']
    self.__dict__['rev'] = commit_id

    profiler_start("Updating adjacency matrix for commit %d", (commit_id,))
    if self.__dict__['adj'] is None:
        adj = Adj()
        self.__dict__['adj'] = adj
    else:
        adj = self.__dict__['adj']

    # Reuse the cached files map only if it belongs to this repository.
    rf = self.__dict__['files']
    if rf is not None:
        repo_files_id, repo_files = rf
        if repo_files_id != repo_id:
            del self.__dict__['files']
            repo_files = {}
    else:
        repo_files = {}

    if not repo_files:
        # Get and cache all the files table
        query = "select id, file_name from files where repository_id = ?"
        # profiler_start("Getting files for repository %d", (repo_id,))
        cursor.execute(statement(query, db.place_holder), (repo_id,))
        # profiler_stop("Getting files for repository %d", (repo_id,),
        #               True)
        rs = cursor.fetchmany()
        while rs:
            for id, file_name in rs:
                repo_files[id] = file_name
            rs = cursor.fetchmany()
        self.__dict__['files'] = (repo_id, repo_files)

    adj.files = repo_files

    # Get the files that have been renamed
    # with the new name for the given rev
    query = "select af.file_id, af.new_file_name " + \
            "from actions_file_names af, files f " + \
            "where af.file_id = f.id " + \
            "and af.commit_id = ? " + \
            "and af.type = 'V' " + \
            "and f.repository_id = ?"
    # profiler_start("Getting new file names for commit %d", (commit_id,))
    cursor.execute(statement(query, db.place_holder), (commit_id, repo_id))
    # profiler_stop("Getting new file names for commit %d", (commit_id,),
    #               True)
    rs = cursor.fetchmany()
    while rs:
        for id, file_name in rs:
            adj.files[id] = file_name
        rs = cursor.fetchmany()

    # Get the new file links since the last time
    query = "select fl.parent_id, fl.file_id " + \
            "from file_links fl, files f " + \
            "where fl.file_id = f.id "
    if prev_commit_id is None:
        # First call: only links created at exactly this commit.
        query += "and fl.commit_id = ? "
        args = (commit_id, repo_id)
    else:
        # Incremental call: links created since the previous revision.
        query += "and fl.commit_id between ? and ? "
        args = (prev_commit_id, commit_id, repo_id)
    query += "and f.repository_id = ?"
    # profiler_start("Getting file links for commit %d", (commit_id,))
    cursor.execute(statement(query, db.place_holder), args)
    # profiler_stop("Getting file links for commit %d", (commit_id,), True)
    rs = cursor.fetchmany()
    while rs:
        for f1, f2 in rs:
            # Child f2 hangs off parent f1.
            adj.adj[f2] = f1
        rs = cursor.fetchmany()

    profiler_stop("Updating adjacency matrix for commit %d", (commit_id,),
                  True)
def run(self, repo, repo_uri):
    """Fetch the patch for this job's revision from the repository."""
    # The profiler args were passed bare as (self.rev) — a parenthesized
    # expression, not a 1-tuple.  Fixed to (self.rev,) for consistency
    # with every other profiler call site and to avoid mis-formatting
    # if rev were ever a tuple.
    profiler_start("Processing patch for revision %s", (self.rev,))
    self.repo = repo
    self.repo_uri = repo_uri
    self.get_patch_for_commit()
    profiler_stop("Processing patch for revision %s", (self.rev,))
                             (i,))
                processed_jobs = self.__process_finished_jobs(job_pool,
                                                              write_cursor,
                                                              db)
                connection.commit()
                i = i - processed_jobs
                # If too few jobs completed, wait for the pool to drain
                # before queueing more.
                if processed_jobs < (queuesize / 5):
                    job_pool.join()

        job_pool.join()
        self.__process_finished_jobs(job_pool, write_cursor, db)

        profiler_start("Inserting results in db")
        #self.__insert_many(write_cursor)
        connection.commit()
        profiler_stop("Inserting results in db")

        read_cursor.close()
        write_cursor.close()
        connection.close()

        # This turns off the profiler and deletes it's timings
        profiler_stop("Running content extension", delete=True)

    def backout(self, repo, uri, db):
        """Delete all content rows recorded for this repository."""
        update_statement = """delete from content where
                              commit_id in (select id from scmlog s
                                            where s.repository_id = ?)"""
        self._do_backout(repo, uri, db, update_statement)
execute_statement(statement(__insert__, self.db.place_holder), (patch_id, commit_id_new, file_id, old_class, new_class, old_function, new_function, 0), write_cursor, db, "\nCouldn't insert, duplicate patch?", exception=ExtensionRunError) #clear old_cla.clear() new_cla.clear() old_func.clear() new_func.clear() cnn.commit() write_cursor.close() cursor.close() cnn.close() profiler_stop("Running Patches extension", delete=True) end = time.time() print function_name_change_count, 'file change name!' print 'num of source file:', num_of_source print 'num of exception:', num_of_exception print 'num of non_source_file:', non_source_file print 'num of files can not be recovered:', num_of_unrecovered print 'num_of_id1:', num_of_id1 print 'consuming time: %ss' % str(end - start) register_extension("Analyse_patch", Analyse_patch)
class HunkBlameJob(Job):
    """Job that blames only the line ranges covered by the given hunks
    and records, per hunk, the set of revisions those lines came from."""

    class BlameContentHandler(BlameJob.BlameContentHandler):
        # hunks: iterable of (hunk_id, start_line, end_line) tuples.
        def __init__(self, hunks):
            self.hunks = hunks
            self.bug_revs = {}

        def line(self, blame_line):
            # Start the per-file timer lazily on the first blame line.
            # NOTE(review): self.profiled is only initialized in
            # start_file(); line() before start_file() would raise
            # AttributeError — confirm the parser always calls
            # start_file first.
            if not self.profiled:
                profiler_start("Processing blame output for %s",
                               (self.filename))
                self.profiled = True
            for hunk_id, start_line, end_line in self.hunks:
                if blame_line.line >= start_line and \
                        blame_line.line <= end_line:
                    if self.bug_revs.get(hunk_id) is None:
                        self.bug_revs[hunk_id] = set()
                    self.bug_revs[hunk_id].add(blame_line.rev)
                    # A line belongs to at most one hunk.
                    break

        def start_file(self, filename):
            self.filename = filename
            self.profiled = False

        def end_file(self):
            profiler_stop("Processing blame output for %s", (self.filename))
            if len(self.bug_revs) == 0:
                printdbg("No bug revision found in this file")

    def __init__(self, hunks, path, rev):
        Job.__init__(self)
        self.hunks = hunks
        self.path = path
        self.rev = rev
        self.bug_revs = {}

    def run(self, repo, repo_uri):
        """Blame self.path at self.rev, restricted to the smallest line
        range spanning all hunks, and collect the results."""
        profiler_start("Running HunkBlameJob for %s@%s",
                       (self.path, self.rev))

        def blame_line(line, p):
            p.feed(line)

        # Compute the overall [start, end] line window covering every
        # hunk (hunk tuples are (id, start_line, end_line)).
        start = sys.maxint
        end = 0
        for hunk in self.hunks:
            if hunk[1] < start:
                start = hunk[1]
            if hunk[2] > end:
                end = hunk[2]

        repo_type = repo.get_type()
        if repo_type == 'cvs':
            # CVS paths contain the module stuff
            uri = repo.get_uri_for_path(repo_uri)
            module = uri[len(repo.get_uri()):].strip('/')
            if module != '.':
                path = self.path[len(module):].strip('/')
            else:
                path = self.path.strip('/')
        else:
            path = self.path.strip('/')

        p = create_parser(repo.get_type(), self.path)
        out = self.get_content_handler()
        p.set_output_device(out)
        wid = repo.add_watch(BLAME, blame_line, p)
        try:
            repo.blame(os.path.join(repo_uri, path), self.rev,
                       start=start, end=end)
            self.collect_results(out)
        except RepositoryCommandError, e:
            # Record the failure but keep the extension running.
            self.failed = True
            printerr("Command %s returned %d (%s)",
                     (e.cmd, e.returncode, e.error))
        p.end()
        repo.remove_watch(BLAME, wid)
        profiler_stop("Running HunkBlameJob for %s@%s",
                      (self.path, self.rev), delete=True)
            # Composed revisions are "rev|something"; keep only the
            # revision part.
            if composed:
                rev = revision.split ("|")[0]
            else:
                rev = revision

            relative_path = fr.get_path ()
            printdbg ("Path for %d at %s -> %s",
                      (file_id, rev, relative_path))

            # Blaming a whole SVN tags/ tree would be meaningless and
            # expensive; skip it.
            if repo.get_type () == 'svn' and relative_path == 'tags':
                printdbg ("Skipping file %s", (relative_path,))
                continue

            job = BlameJob (file_id, commit_id, relative_path, rev)
            job_pool.push (job)
            n_blames += 1

            # Flush finished jobs in batches of MAX_BLAMES.
            if n_blames >= self.MAX_BLAMES:
                job_pool.join()
                self.process_finished_jobs (job_pool, write_cursor)
                n_blames = 0

        job_pool.join ()
        self.process_finished_jobs (job_pool, write_cursor, True)

        read_cursor.close ()
        write_cursor.close ()
        cnn.close()

        profiler_stop ("Running Blame extension", delete = True)

register_extension ("Blame", Blame)
                          (repo.get_type()))
        except Exception, e:
            raise ExtensionRunError( \
                "Error creating repository %s. Exception: %s" % \
                (repo.get_uri(), str(e)))

        profiler_start("Hunks: fetch all patches")
        icursor = ICursor(read_cursor, self.INTERVAL_SIZE)
        # Get the patches from this repository
        query = """select p.commit_id, p.patch, s.rev
                   from patches p, scmlog s
                   where p.commit_id = s.id and
                   s.repository_id = ? and
                   p.patch is not NULL"""
        icursor.execute(statement(query, db.place_holder), (repo_id, ))
        profiler_stop("Hunks: fetch all patches", delete=True)

        self.__prepare_table(connection)
        fp = FilePaths(db)

        rs = icursor.fetchmany()
        while rs:
            for commit_id, patch_content, rev in rs:
                for hunk in self.get_commit_data(patch_content):
                    # Get the file ID from the database for linking.
                    # Diff paths are prefixed "a/" or "b/"; strip that
                    # before looking the file up.
                    hunk_file_name = re.sub(r'^[ab]\/', '',
                                            hunk.file_name.strip())
                    file_id = fp.get_file_id(hunk_file_name, commit_id)
                    if file_id == None:
                        printdbg("file not found")
                        set is_bug_fix = ?
                        where id = ?"""
            # 1 when the commit message matches a bug-fix pattern,
            # 0 otherwise.
            if self.fixes_bug(commit_message):
                is_bug_fix = 1
            else:
                is_bug_fix = 0
            execute_statement(statement(update, db.place_holder),
                              (is_bug_fix, row_id),
                              write_cursor, db,
                              "Couldn't update scmlog",
                              exception=ExtensionRunError)
        read_cursor.close()
        connection.commit()
        connection.close()

        # This turns off the profiler and deletes its timings
        profiler_stop("Running BugFixMessage extension", delete=True)

    def backout(self, repo, uri, db):
        """Reset the is_bug_fix flag for every commit of the repository."""
        backout_statement = """update scmlog
                               set is_bug_fix = NULL
                               where repository_id = ?"""
        self._do_backout(repo, uri, db, backout_statement)

register_extension("BugFixMessage", BugFixMessage)
                        finally:
                            inner_cursor.close()

                        # Drop hunks that already have blame results.
                        hunks = [h for h in hunks if h[0] not in blames]
                        job = HunkBlameJob(hunks, relative_path, pre_rev)

                        job_pool.push (job)
                        n_blames += 1

                        if n_blames >= self.MAX_BLAMES:
                            processed_jobs = self.process_finished_jobs (job_pool, write_cursor)
                            n_blames -= processed_jobs
                            # Too few completions: wait for the pool.
                            if processed_jobs<=self.MAX_BLAMES/5:
                                profiler_start("Joining unprocessed jobs")
                                job_pool.join()
                                profiler_stop("Joining unprocessed jobs", delete=True)
                except NotValidHunkWarning as e:
                    printerr("Not a valid hunk: "+str(e))
                finally:
                    file_rev = read_cursor.fetchone()

        job_pool.join ()
        self.process_finished_jobs (job_pool, write_cursor, True)

        try:
            self.__drop_cache(cnn)
        except:
            # NOTE(review): this bare except references `e`, which is
            # not bound here — if __drop_cache fails this will raise
            # NameError (or report a stale exception); bind the caught
            # exception explicitly.
            printdbg("Couldn't drop cache because of " + str(e))

        read_cursor.close ()
        write_cursor.close ()
                job = ContentJob(commit_id, file_id, rev, relative_path)
                job_pool.push(job)
                i = i + 1
                # Flush completed jobs whenever the queue fills up.
                if i >= queuesize:
                    printdbg("Content queue is now at %d, flushing to database",
                             (i,))
                    processed_jobs = self.__process_finished_jobs(job_pool,
                                                                  connection,
                                                                  db)
                    i = i - processed_jobs
                    # Too few completed: block until the pool drains.
                    if processed_jobs < (queuesize / 5):
                        job_pool.join()

        job_pool.join()
        self.__process_finished_jobs(job_pool, connection, db)

        read_cursor.close()
        connection.close()

        # This turns off the profiler and deletes it's timings
        profiler_stop("Running content extension", delete=True)

    def backout(self, repo, uri, db):
        """Delete all content rows recorded for this repository."""
        update_statement = """delete from content where
                              commit_id in (select id from scmlog s
                                            where s.repository_id = ?)"""
        self._do_backout(repo, uri, db, update_statement)

register_extension("Content", Content)
                    printdbg("FileCount queue is now at %d, flushing to database",
                             (i, ))
                    processed_jobs = self.__process_finished_jobs(
                        job_pool, write_cursor, db)
                    connection.commit()
                    i = i - processed_jobs
                    # Too few completed jobs: wait for the pool to drain.
                    if processed_jobs < (queuesize / 5):
                        job_pool.join()

        job_pool.join()
        self.__process_finished_jobs(job_pool, write_cursor, db)

        read_cursor.close()
        connection.commit()
        connection.close()

        # This turns off the profiler and deletes its timings
        profiler_stop("Running FileCount extension", delete=True)

    def backout(self, repo, uri, db):
        """Reset the file_count column for every commit of the repository."""
        update_statement = """update scmlog
                              set file_count = NULL
                              where repository_id = ?"""
        self._do_backout(repo, uri, db, update_statement)

register_extension("FileCount", FileCount)
def end_file (self):
    # End of blame output for the current file: close the per-file
    # profiler timer and note when no bug revision was attributed.
    profiler_stop("Processing blame output for %s", (self.filename))
    if not self.bug_revs:
        printdbg("No bug revision found in this file")
                # Flush completed jobs whenever the queue fills up.
                if i >= queuesize:
                    printdbg("FileCount queue is now at %d, flushing to database",
                             (i,))
                    processed_jobs = self.__process_finished_jobs(job_pool,
                                                                  write_cursor,
                                                                  db)
                    connection.commit()
                    i = i - processed_jobs
                    # Too few completed: block until the pool drains.
                    if processed_jobs < (queuesize / 5):
                        job_pool.join()

        job_pool.join()
        self.__process_finished_jobs(job_pool, write_cursor, db)

        read_cursor.close()
        connection.commit()
        connection.close()

        # This turns off the profiler and deletes its timings
        profiler_stop("Running FileCount extension", delete=True)

    def backout(self, repo, uri, db):
        """Reset the file_count column for every commit of the repository."""
        update_statement = """update scmlog
                              set file_count = NULL
                              where repository_id = ?"""
        self._do_backout(repo, uri, db, update_statement)

register_extension("FileCount", FileCount)
                            inner_cursor.close()

                        # Drop hunks that already have blame results.
                        hunks = [h for h in hunks if h[0] not in blames]
                        job = HunkBlameJob(hunks, relative_path, pre_rev)

                        job_pool.push(job)
                        n_blames += 1

                        if n_blames >= self.MAX_BLAMES:
                            processed_jobs = self.process_finished_jobs(
                                job_pool, write_cursor)
                            n_blames -= processed_jobs
                            # Too few completions: wait for the pool.
                            if processed_jobs <= self.MAX_BLAMES / 5:
                                profiler_start("Joining unprocessed jobs")
                                job_pool.join()
                                profiler_stop("Joining unprocessed jobs",
                                              delete=True)
                except NotValidHunkWarning as e:
                    printerr("Not a valid hunk: " + str(e))
                finally:
                    file_rev = read_cursor.fetchone()

        job_pool.join()
        self.process_finished_jobs(job_pool, write_cursor, True)

        try:
            self.__drop_cache(cnn)
        except:
            # NOTE(review): `e` is not bound in this bare except — a
            # failure here raises NameError (or reports a stale
            # exception); bind the caught exception explicitly.
            printdbg("Couldn't drop cache because of " + str(e))

        read_cursor.close()
        write_cursor.close()
                             (i, ))
                processed_jobs = self.__process_finished_jobs(
                    job_pool, write_cursor, db)
                connection.commit()
                i = i - processed_jobs
                # Too few completed jobs: wait for the pool to drain.
                if processed_jobs < (queuesize / 5):
                    job_pool.join()

        job_pool.join()
        self.__process_finished_jobs(job_pool, write_cursor, db)

        profiler_start("Inserting results in db")
        #self.__insert_many(write_cursor)
        connection.commit()
        profiler_stop("Inserting results in db")

        read_cursor.close()
        write_cursor.close()
        connection.close()

        # This turns off the profiler and deletes it's timings
        profiler_stop("Running content extension", delete=True)

    def backout(self, repo, uri, db):
        """Delete all content rows recorded for this repository."""
        update_statement = """delete from content where
                              commit_id in (select id from scmlog s
                                            where s.repository_id = ?)"""
        self._do_backout(repo, uri, db, update_statement)
def end_file(self):
    # The parser has finished emitting blame output for this file;
    # stop the per-file profiler timer started in start_file().
    profiler_stop("Processing blame output for %s", (self.filename))
    bug_rev_count = len(self.bug_revs)
    if bug_rev_count == 0:
        printdbg("No bug revision found in this file")
                          (repo.get_type()))
        except Exception, e:
            raise ExtensionRunError( \
                "Error creating repository %s. Exception: %s" % \
                (repo.get_uri(), str(e)))

        profiler_start("Hunks: fetch all patches")
        icursor = ICursor(read_cursor, self.INTERVAL_SIZE)
        # Get the patches from this repository
        query = """select p.commit_id, p.patch, s.rev
                   from patches p, scmlog s
                   where p.commit_id = s.id and
                   s.repository_id = ? and
                   p.patch is not NULL"""
        icursor.execute(statement(query, db.place_holder), (repo_id,))
        profiler_stop("Hunks: fetch all patches", delete=True)

        self.__prepare_table(connection)
        fp = FilePaths(db)

        rs = icursor.fetchmany()
        while rs:
            for commit_id, patch_content, rev in rs:
                for hunk in self.get_commit_data(patch_content):
                    # Get the file ID from the database for linking.
                    # Diff paths are prefixed "a/" or "b/"; strip that
                    # before looking the file up.
                    hunk_file_name = re.sub(r'^[ab]\/', '',
                                            hunk.file_name.strip())
                    file_id = fp.get_file_id(hunk_file_name, commit_id)
                    if file_id == None:
                        printdbg("file not found")