def get_path_from_database(self, file_id, commit_id):
    """Returns the last valid path for a given file_id at commit_id
    (it may have been removed afterwards!)"""
    if config.debug:
        profiler_start("Getting full file path for file_id %d and commit_id %d",
                       (file_id, commit_id))

    db = self.__dict__['db']
    cnn = db.connect()
    cursor = cnn.cursor()
    query = """SELECT current_file_path from actions
               WHERE file_id = ?
               AND commit_id <= ?
               ORDER BY commit_id DESC
               LIMIT 1"""
    cursor.execute(statement(query, db.place_holder), (file_id, commit_id))
    try:
        file_path = cursor.fetchone()[0]
    except:
        file_path = None
    cursor.close()
    cnn.close()

    printdbg("get_path_from_database: Path for file_id %d at commit_id %d: %s",
             (file_id, commit_id, file_path))
    if config.debug:
        profiler_stop("Getting full file path for file_id %d and commit_id %d",
                      (file_id, commit_id), True)
    return file_path
def get_line_types(repo, repo_uri, rev, path):
    """Returns an array with one item per line of code.
    Each item is labeled 'code', 'comment' or 'empty'."""
    #profiler_start("Processing LineTypes for revision %s:%s", (self.rev, self.file_path))
    # Concatenate repo_uri and file_path for the full path
    uri = os.path.join(repo_uri, path)
    file_content = _get_file_content(repo, uri, rev)

    if file_content is None or file_content == '':
        printerr("[get_line_types] Error: No file content for " + str(rev) +
                 ":" + str(path) + " found! Skipping.")
        line_types = None
    else:
        try:
            lexer = get_lexer_for_filename(path)
        except ClassNotFound:
            try:
                printdbg("[get_line_types] Guessing lexer for " + str(rev) +
                         ":" + str(path) + ".")
                lexer = guess_lexer(file_content)
            except ClassNotFound:
                printdbg("[get_line_types] No guess or lexer found for " +
                         str(rev) + ":" + str(path) +
                         ". Using TextLexer instead.")
                lexer = TextLexer()

        if isinstance(lexer, NemerleLexer):
            # This lexer is broken and yields an unstoppable process, see
            # https://bitbucket.org/birkenfeld/pygments-main/issue/706/nemerle-lexer-ends-in-an-infinite-loop
            lexer = TextLexer()

        # Not sure if this should be skipped when the language uses off-side
        # rules (e.g. Python, see http://en.wikipedia.org/wiki/Off-side_rule
        # for a list)
        stripped_code = _strip_lines(file_content)
        lexer_output = _iterate_lexer_output(lexer.get_tokens(stripped_code))
        line_types_str = _comment_empty_or_code(lexer_output)
        line_types = line_types_str.split("\n")

    return line_types
def get_path_from_database(self, file_id, commit_id):
    """Returns the last valid path for a given file_id at commit_id
    (it may have been removed afterwards!)"""
    if config.debug:
        profiler_start("Getting full file path for file_id %d and commit_id %d",
                       (file_id, commit_id))

    db = self.__dict__['db']
    cnn = db.connect()
    cursor = cnn.cursor()
    query = """SELECT file_path from file_paths
               WHERE file_id = ?
               AND commit_id <= ?
               ORDER BY commit_id DESC
               LIMIT 1"""
    cursor.execute(statement(query, db.place_holder), (file_id, commit_id))
    try:
        file_path = cursor.fetchone()[0]
    except:
        file_path = None
    cursor.close()
    cnn.close()

    printdbg("get_path_from_database: Path for file_id %d at commit_id %d: %s",
             (file_id, commit_id, file_path))
    if config.debug:
        profiler_stop("Getting full file path for file_id %d and commit_id %d",
                      (file_id, commit_id), True)
    return file_path
def __create_cache(self, cnn):
    cursor = cnn.cursor()
    try:
        self.__drop_cache(cnn)
    except Exception as e:
        printdbg("Couldn't drop cache because of " + str(e))
def line_is_code(line_types_array, line_nr):
    """Decides whether a given line number is executable code"""
    try:
        line_type = line_types_array[line_nr - 1]
    except IndexError:
        printdbg("Line not in lexer output. Must be an empty line!")
        line_type = None

    return line_type == "code"
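# A minimal usage sketch (not from the original code base), assuming
# line_is_code() and printdbg() are available as defined above. Given the
# per-line labels that get_line_types() would return for a three-line file,
# line_is_code() maps 1-based line numbers onto that list.
line_types_array = ["code", "comment", "empty"]

assert line_is_code(line_types_array, 1) is True   # labeled 'code'
assert line_is_code(line_types_array, 2) is False  # labeled 'comment'
assert line_is_code(line_types_array, 4) is False  # past the end: IndexError path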
def __match_string(self, regexes, flags, string):
    """Checks whether a string matches a series of regexes"""
    for r in regexes:
        # The bit at the beginning and end matches whitespace, punctuation
        # or the start or end of a line.
        delimiters = r"[\s\.,;\!\?\'\"\/\\]"
        if re.search("(" + delimiters + "+|^)" + r +
                     "(" + delimiters + "+|$)", string, flags):
            printdbg("[STRING] matched on " + str(r) + " " + string)
            return True

    return False
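# A small standalone sketch (hypothetical helper, not part of the original
# class) showing the delimiter pattern above in isolation: a keyword only
# matches when bounded by whitespace/punctuation or a line boundary, so
# "fix" does not fire inside "prefix". Case-insensitive matching is an
# assumption made for the demo.
import re

delimiters = r"[\s\.,;\!\?\'\"\/\\]"

def _matches_keyword(keyword, text, flags=re.IGNORECASE):
    pattern = "(" + delimiters + "+|^)" + keyword + "(" + delimiters + "+|$)"
    return re.search(pattern, text, flags) is not None

assert _matches_keyword("fix", "fix bug 123")
assert _matches_keyword("fix", "This is a fix.")
assert not _matches_keyword("fix", "update the prefix table")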
def populate_insert_args(self, job):
    authors = job.get_authors()
    file_id = job.get_file_id()
    commit_id = job.get_commit_id()
    try:
        args = [(self.id_counter + i, file_id, commit_id,
                 self.authors[key], authors[key])
                for i, key in enumerate(authors.keys())]
    except:
        printdbg("Error occurred while processing file %d @ commit %d",
                 (file_id, commit_id))
        raise

    return args
def statement(str, ph_mark):
    """Replaces the '?' placeholders in a query with the database's
    placeholder mark, leaving quoted string literals untouched."""
    if "?" == ph_mark or "?" not in str:
        printdbg(str)
        return str

    tokens = str.split("'")
    for i in range(0, len(tokens), 2):
        tokens[i] = tokens[i].replace("?", ph_mark)

    retval = "'".join(tokens)
    printdbg(retval)
    return retval
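# A minimal usage sketch, assuming statement() as defined above. With a
# MySQL-style placeholder mark ('%s'), the '?' markers outside quoted
# literals are rewritten, while a literal '?' inside quotes is preserved.
query = "select id from scmlog where rev = ? and msg = 'why?'"

assert statement(query, "%s") == \
    "select id from scmlog where rev = %s and msg = 'why?'"
# With a database that already uses '?', the query is returned unchanged.
assert statement(query, "?") == query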
def __execute(self):
    """Executes the next chunk of the query, advancing the offset
    by interval_size."""
    q = "%s LIMIT %d OFFSET %d" % (self.query, self.interval_size, self.i)
    self.i += self.interval_size
    printdbg(q)
    if self.args:
        self.cursor.execute(q, self.args)
    else:
        self.cursor.execute(q)
    self.need_exec = False
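# A hedged sketch (hypothetical helper, not from the code base) of the same
# LIMIT/OFFSET pagination idea in isolation: fetch a query in fixed-size
# chunks until the cursor runs dry. The cursor is any DB-API cursor.
def iterate_in_chunks(cursor, query, interval_size=1000, args=None):
    offset = 0
    while True:
        q = "%s LIMIT %d OFFSET %d" % (query, interval_size, offset)
        if args:
            cursor.execute(q, args)
        else:
            cursor.execute(q)
        rows = cursor.fetchall()
        if not rows:
            break
        for row in rows:
            yield row
        offset += interval_size

# Example (assuming an open cursor):
#     for row in iterate_in_chunks(cursor, "select id, rev from scmlog"):
#         ...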
def populate_insert_args(self, job):
    bug_revs = job.get_bug_revs()

    cnn = self.db.connect()
    cursor = cnn.cursor()
    args = []
    for hunk_id in bug_revs:
        for rev in bug_revs[hunk_id]:
            printdbg("Find id for rev %s" % rev)
            query = "select id from scmlog where rev = ?"
            cursor.execute(statement(query, self.db.place_holder), (rev, ))
            fetched_row = cursor.fetchone()
            if fetched_row is not None:
                args.append((hunk_id, fetched_row[0]))
    cursor.close()
    cnn.close()
    return args
def get_all_extensions():
    # Do something to get a list of extensions, probably like a file
    # glob, then do a get_extension on each one. Return the entire
    # _extensions list

    # Get a list of the paths that are sitting in the directory with this
    # script, i.e. all possible extensions
    possible_file_paths = glob(os.path.realpath(os.path.dirname(__file__))
                               + "/*.py")

    # This splitting extracts the file name from the path. Special Python
    # files such as __init__.py are filtered out of the list.
    for extension in [os.path.splitext(os.path.split(fp)[1])[0]
                      for fp in possible_file_paths
                      if (not fp.startswith('__')
                          and not fp.endswith('__.py'))]:
        try:
            printdbg("Getting extension " + extension)
            get_extension(extension)
        except ExtensionUnknownError:
            pass

    return _extensions
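# A quick illustration (hypothetical path) of the name extraction used above:
# os.path.split() takes the file name from the path and os.path.splitext()
# drops the ".py" suffix, leaving the bare extension name.
import os

fp = "/opt/tool/extensions/BugFixMessage.py"   # hypothetical path
name = os.path.splitext(os.path.split(fp)[1])[0]
assert name == "BugFixMessage"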
def run(self, repo, uri, db):
    self.db = db
    self.repo = repo

    path = uri_to_filename(uri)
    if path is not None:
        repo_uri = repo.get_uri_for_path(path)
    else:
        repo_uri = uri

    path = uri_to_filename(uri)
    self.repo_uri = path or repo.get_uri()

    cnn = self.db.connect()
    cursor = cnn.cursor()
    cursor.execute(statement("SELECT id from repositories where uri = ?",
                             db.place_holder), (repo_uri, ))
    repo_id = cursor.fetchone()[0]

    # If the table does not exist, the list of commits is empty,
    # otherwise it will be filled within the except block below
    commits = []

    try:
        printdbg("Creating patches table")
        self.__create_table(cnn)
    except TableAlreadyExists:
        printdbg("Patches table exists already, getting max ID")
        cursor.execute(statement("SELECT max(id) from patches",
                                 db.place_holder))
        id = cursor.fetchone()[0]
        if id is not None:
            DBPatch.id_counter = id + 1

        commits = self.__get_patches_for_repository(repo_id, cursor)
    except Exception as e:
        raise ExtensionRunError(str(e))
def run(self, repo, uri, db):
    profiler_start("Running Patches extension")

    self.db = db
    self.repo = repo

    path = uri_to_filename(uri)
    if path is not None:
        repo_uri = repo.get_uri_for_path(path)
    else:
        repo_uri = uri

    path = uri_to_filename(uri)
    self.repo_uri = path or repo.get_uri()

    cnn = self.db.connect()
    cursor = cnn.cursor()
    cursor.execute(statement("SELECT id from repositories where uri = ?",
                             db.place_holder), (repo_uri,))
    repo_id = cursor.fetchone()[0]

    # If the table does not exist, the list of commits is empty,
    # otherwise it will be filled within the except block below
    commits = []

    try:
        printdbg("Creating patches table")
        self.__create_table(cnn)
    except TableAlreadyExists:
        printdbg("Patches table exists already, getting max ID")
        cursor.execute(statement("SELECT max(id) from patches",
                                 db.place_holder))
        id = cursor.fetchone()[0]
        if id is not None:
            DBPatch.id_counter = id + 1

        commits = self.__get_patches_for_repository(repo_id, cursor)
    except Exception as e:
        raise ExtensionRunError(str(e))
def close(self):
    """Closes FilePaths to ensure all caches are deleted"""
    if Config().low_memory:
        # FIXME: This should be closed, but sometimes shelve
        # just won't do it. The best way is to time out the try,
        # but not closing and just deleting will do the same
        # thing, just in a more yucky way
        printdbg("Syncing shelf")
        self.__dict__['cached_adj'].sync()
        printdbg("Closing shelf")
        self.__dict__['cached_adj'].close()
        printdbg("Deleting shelve " + self.shelve_file_name)
        os.remove(self.shelve_file_name)

    # Clean up cached adj in case this gets called without
    # update_all later
    self.__dict__['cached_adj'] = {}
# Get code files
query = "select f.id from file_types ft, files f " + \
        "where f.id = ft.file_id and " + \
        "ft.type in ('code', 'unknown') and " + \
        "f.repository_id = ?"
read_cursor.execute(statement(query, db.place_holder), (repoid,))
code_files = [item[0] for item in read_cursor.fetchall()]

n_blames = 0
fr = FileRevs(db, cnn, read_cursor, repoid)
for revision, commit_id, file_id, action_type, composed in fr:
    if file_id not in code_files:
        continue

    if (file_id, commit_id) in blames:
        printdbg("%d@%d is already in the database, skip it",
                 (file_id, commit_id))
        continue

    if composed:
        rev = revision.split("|")[0]
    else:
        rev = revision

    relative_path = fr.get_path()
    printdbg("Path for %d at %s -> %s", (file_id, rev, relative_path))

    if repo.get_type() == 'svn' and relative_path == 'tags':
        printdbg("Skipping file %s", (relative_path,))
        continue

    job = BlameJob(file_id, commit_id, relative_path, rev)
self.__prepare_table(connection)

i = 0
for row in read_cursor:
    row_id = row[0]
    rev = row[1]
    job = FileCountJob(row_id, rev)
    job_pool.push(job)
    i = i + 1
    if i >= queuesize:
        printdbg("FileCount queue is now at %d, flushing to database", (i, ))
        processed_jobs = self.__process_finished_jobs(job_pool,
                                                      write_cursor, db)
        connection.commit()
        i = i - processed_jobs
        if processed_jobs < (queuesize / 5):
            job_pool.join()

job_pool.join()
self.__process_finished_jobs(job_pool, write_cursor, db)

read_cursor.close()
connection.commit()
connection.close()
                  and h.file_id is not null
                  and h.commit_id is not null
               """
read_cursor.execute(statement(outer_query, db.place_holder), (repoid,))
file_rev = read_cursor.fetchone()
n_blames = 0

fp = FilePaths(db)
fp.update_all(repoid)

while file_rev is not None:
    try:
        file_id, commit_id = file_rev
        pre_commit_id, pre_rev = self.__find_previous_commit(file_id,
                                                             commit_id)
        relative_path = fp.get_path(file_id, pre_commit_id, repoid)
        if relative_path is None:
            raise NotValidHunkWarning(
                "Couldn't find path for file ID %d" % file_id)
        printdbg("Path for %d at %s -> %s", (file_id, pre_rev, relative_path))

        try:
            inner_cursor = cnn.cursor()
            inner_query = """select h.id, h.old_start_line, h.old_end_line
                             from hunks h
                             where h.file_id = ?
                             and h.commit_id = ?
                             and h.old_start_line is not null
                             and h.old_end_line is not null
                             and h.file_id is not null
                             and h.commit_id is not null
                          """
            inner_cursor.execute(statement(inner_query, db.place_holder),
                                 (file_id, commit_id))
            hunks = inner_cursor.fetchall()  # FIXME
        except Exception as e:
except Exception as e:
    raise ExtensionRunError(
        "Error creating repository %s. Exception: %s" %
        (repo.get_uri(), str(e)))

# Try to create a table for storing the content
# TODO: Removed use case for choosing between all or just the HEAD,
# should ideally put that back again. Just all for now is fine.
try:
    self.__prepare_table(connection)
except Exception as e:
    raise ExtensionRunError("Couldn't prepare table because " + str(e))

queuesize = Config().max_threads
printdbg("Setting queuesize to " + str(queuesize))

# This is where the threading stuff comes in, I expect
job_pool = JobPool(repo, path or repo.get_uri(), queuesize=queuesize)

# This filters out files that are not source files.
# I'm pretty sure "unknown" is returning binary files too, but
# these are implicitly left out when trying to convert to utf-8
# after download. However, ignore them for now to speed things up
query = "select f.id from file_types ft, files f " + \
        "where f.id = ft.file_id and " + \
        "ft.type in('code') and " + \
        "f.repository_id = ?"
# "ft.type in('code', 'unknown') and " + \
read_cursor.execute(statement(query, db.place_holder), (repo_id, ))
code_files = [item[0] for item in read_cursor.fetchall()]
read_cursor.execute(statement(outer_query, db.place_holder), (repoid, ))
file_rev = read_cursor.fetchone()
n_blames = 0

fp = FilePaths(db)
fp.update_all(repoid)

while file_rev is not None:
    try:
        file_id, commit_id = file_rev
        pre_commit_id, pre_rev = self.__find_previous_commit(
            file_id, commit_id)
        relative_path = fp.get_path(file_id, pre_commit_id, repoid)
        if relative_path is None:
            raise NotValidHunkWarning(
                "Couldn't find path for file ID %d" % file_id)
        printdbg("Path for %d at %s -> %s",
                 (file_id, pre_rev, relative_path))

        try:
            inner_cursor = cnn.cursor()
            inner_query = """select h.id, h.old_start_line, h.old_end_line
                             from hunks h
                             where h.file_id = ?
                             and h.commit_id = ?
                             and h.old_start_line is not null
                             and h.old_end_line is not null
                             and h.file_id is not null
                             and h.commit_id is not null
                          """
            inner_cursor.execute(
                statement(inner_query, db.place_holder),
                (file_id, commit_id))
            hunks = inner_cursor.fetchall()
while rs:
    for commit_id, revision, composed_rev in rs:
        if commit_id in commits:
            continue

        if composed_rev:
            rev = revision.split("|")[0]
        else:
            rev = revision

        job = PatchJob(rev, commit_id)
        job_pool.push(job)

        i = i + 1
        if i >= queuesize:
            printdbg("Queue is now at %d, flushing to database", (i,))
            job_pool.join()
            self.__process_finished_jobs(job_pool, write_cursor, db)
            cnn.commit()
            i = 0

    cnn.commit()
    rs = icursor.fetchmany()

job_pool.join()
self.__process_finished_jobs(job_pool, write_cursor, db)
cnn.commit()

write_cursor.close()
cursor.close()
cnn.close()
def get_commit_data(self, patch_content):
    lines = [l + "\n" for l in patch_content.split("\n") if l]
    hunks = []

    for patch in [p for p in parse_patches(lines, allow_dirty=True,
                                           allow_continue=True)
                  if isinstance(p, Patch)]:
        # This method matches that of parseLine in UnifiedDiffParser.java.
        # It's not necessarily intuitive, but the algorithm is much harder
        # than it looks; I spent hours trying to find a simpler solution.
        # It does, however, seem to work.
        # The trick this method uses is that each *part* of a hunk,
        # i.e. added, deleted or changed, is treated as a *new entity*.
        # The EntityDelta table does not store just diffs, it stores
        # each part of a diff.
        # I will need to copy the behavior of how Sep inserts a CommitData
        # into the database to ensure things match
        for hunk in patch.hunks:
            old_start_line = hunk.orig_pos - 1
            new_start_line = hunk.mod_pos - 1

            old_end_line = 0
            new_end_line = 0

            added = False
            deleted = False
            in_change = False

            for line in hunk.lines:
                if isinstance(line, RemoveLine):
                    if not in_change or not deleted:
                        in_change = True
                        old_start_line += 1
                        old_end_line = old_start_line
                    else:
                        old_end_line += 1

                    deleted = True

                elif isinstance(line, InsertLine):
                    if not in_change or not added:
                        in_change = True
                        new_start_line += 1
                        new_end_line = new_start_line
                    else:
                        new_end_line += 1

                    added = True

                elif isinstance(line, ContextLine):
                    if in_change:
                        in_change = False

                        printdbg("Patch new name: " + patch.newname)
                        file_name = patch.newname.strip()
                        if file_name == "/dev/null":
                            file_name = patch.oldname.strip()
                        cd = CommitData(file_name)

                        if deleted:
                            cd.old_start_line = old_start_line
                            cd.old_end_line = old_end_line
                            old_start_line = old_end_line

                        if added:
                            cd.new_start_line = new_start_line
                            cd.new_end_line = new_end_line
                            new_start_line = new_end_line

                        hunks.append(cd)
                        added = deleted = False

                    old_start_line += 1
                    new_start_line += 1

            # The diff ended without a new context line
            if in_change:
                file_name = patch.newname.strip()
                if file_name == "/dev/null":
                    file_name = patch.oldname.strip()
                cd = CommitData(file_name)

                if deleted:
                    cd.old_start_line = old_start_line
                    cd.old_end_line = old_end_line

                if added:
                    cd.new_start_line = new_start_line
                    cd.new_end_line = new_end_line

                hunks.append(cd)

    return hunks
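# A hedged, self-contained illustration (not the extension's own code) of how
# the loop above splits one hunk into "entities": each contiguous run of
# removed/inserted lines between context lines becomes one record with
# old/new start and end lines. The string tags stand in for bzrlib's
# RemoveLine/InsertLine/ContextLine objects, and the record dicts stand in
# for CommitData.
def split_hunk(orig_pos, mod_pos, tags):
    old_start, new_start = orig_pos - 1, mod_pos - 1
    old_end = new_end = 0
    added = deleted = in_change = False
    records = []

    def emit():
        rec = {}
        if deleted:
            rec['old'] = (old_start, old_end)
        if added:
            rec['new'] = (new_start, new_end)
        records.append(rec)

    for tag in tags:
        if tag == 'remove':
            if not in_change or not deleted:
                in_change = True
                old_start += 1
                old_end = old_start
            else:
                old_end += 1
            deleted = True
        elif tag == 'insert':
            if not in_change or not added:
                in_change = True
                new_start += 1
                new_end = new_start
            else:
                new_end += 1
            added = True
        else:  # context line
            if in_change:
                in_change = False
                emit()
                if deleted:
                    old_start = old_end
                if added:
                    new_start = new_end
                added = deleted = False
            old_start += 1
            new_start += 1

    # The hunk ended without a closing context line
    if in_change:
        emit()
    return records

# One hunk starting at old line 10 / new line 10: a context line, two removed
# lines, one inserted line, then a closing context line.
print(split_hunk(10, 10, ['context', 'remove', 'remove', 'insert', 'context']))
# -> [{'old': (11, 12), 'new': (11, 11)}]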
    repo_id = read_cursor.fetchone()[0]
except NotImplementedError:
    raise ExtensionRunError("Content extension is not supported for %s repos"
                            % (repo.get_type()))
except Exception as e:
    raise ExtensionRunError("Error creating repository %s. Exception: %s"
                            % (repo.get_uri(), str(e)))

# Try to create a table for storing the content
# TODO: Removed use case for choosing between all or just the HEAD,
# should ideally put that back again. Just all for now is fine.
try:
    self.__prepare_table(connection)
except Exception as e:
    raise ExtensionRunError("Couldn't prepare table because " + str(e))

queuesize = self.MAX_THREADS
printdbg("Setting queuesize to " + str(queuesize))

# This is where the threading stuff comes in, I expect
job_pool = JobPool(repo, path or repo.get_uri(), queuesize=queuesize)

# This filters out files that are not source files.
# I'm pretty sure "unknown" is returning binary files too, but
# these are implicitly left out when trying to convert to utf-8
# after download. However, ignore them for now to speed things up
query = ("select f.id from file_types ft, files f " +
         "where f.id = ft.file_id and " +
         "ft.type in('code') and " +
         "f.repository_id = ?")
# "ft.type in('code', 'unknown') and " + \
def run(self, repo, uri, db):
    # Counts how many patches contain a changed function name
    function_name_change_count = 0
    # Only valid for the author's machine; change it to match your own setup
    prefix = r'/home/moqi/Downloads/voldemort'
    # Old file names
    f_of_old = open('/home/moqi/Downloads/voldemort/old', 'w')
    # New file names
    f_of_new = open('/home/moqi/Downloads/voldemort/new', 'w')
    # Stores the information returned by search_lines
    search_result = {}
    # Number of exceptions (such as /null, or files that have been deleted
    # and so cannot be opened); not an accurate count
    num_of_exception = 0
    # Number of files that do not belong to the source files
    non_source_file = 0
    # Number of patches whose commit_id = 1
    num_of_id1 = 0
    # Number of files that cannot be recovered
    num_of_unrecovered = 0
    # old_cla contains the class definitions in the old file
    old_cla = sets.Set()
    new_cla = sets.Set()
    old_func = sets.Set()
    new_func = sets.Set()
    # Max id in the patches table
    id_max = 0
    # patch_id
    patch_id = 0
    # file_id
    file_id = 0
    # old_class, new_class, old_function, new_function
    old_class = ''
    new_class = ''
    old_function = ''
    new_function = ''

    __insert__ = """INSERT INTO analyse_patch (patch_id, commit_id, file_id,
                    old_class, new_class, old_function, new_function, if_id1)
                    values (?, ?, ?, ?, ?, ?, ?, ?)"""

    start = time.time()
    profiler_start("Running analyse_patch extension")

    self.db = db
    self.repo = repo

    path = uri_to_filename(uri)
    if path is not None:
        repo_uri = repo.get_uri_for_path(path)
        ## Added by me
        prefix = path
    else:
        repo_uri = uri

    path = uri_to_filename(uri)
    self.repo_uri = path or repo.get_uri()

    cnn = self.db.connect()
    cursor = cnn.cursor()
    write_cursor = cnn.cursor()
    cursor.execute(statement("SELECT id from repositories where uri = ?",
                             db.place_holder), (repo_uri,))
    repo_id = cursor.fetchone()[0]

    try:
        printdbg("Creating analyse_patch table")
        self.__create_table(cnn)
    except TableAlreadyExists:
        pass
    except Exception as e:
        raise ExtensionRunError(str(e))
self.__prepare_table(connection)
fp = FilePaths(db)

patches = self.get_patches(repo, path or repo.get_uri(), repo_id, db,
                           read_cursor)

for commit_id, patch_content, rev in patches:
    for hunk in self.get_commit_data(patch_content):
        # Get the file ID from the database for linking
        hunk_file_name = re.sub(r'^[ab]\/', '', hunk.file_name.strip())
        file_id = fp.get_file_id(hunk_file_name, commit_id)

        if file_id is None:
            printdbg("file not found")
            if repo.type == "git":
                # The likelihood is that this is a merge, not a
                # missing ID from some data screw-up.
                # We'll just continue and throw this away
                continue
            else:
                printerr("No file ID found for hunk " + hunk_file_name +
                         " at commit " + str(commit_id))

        insert = """insert into hunks(file_id, commit_id,
                    old_start_line, old_end_line, new_start_line,
                    new_end_line)
                    values(?,?,?,?,?,?)"""
profiler_stop("Hunks: fetch all patches", delete=True) self.__prepare_table(connection) fp = FilePaths(db) rs = icursor.fetchmany() while rs: for commit_id, patch_content, rev in rs: for hunk in self.get_commit_data(patch_content): # Get the file ID from the database for linking hunk_file_name = re.sub(r'^[ab]\/', '', hunk.file_name.strip()) file_id = fp.get_file_id(hunk_file_name, commit_id) if file_id == None: printdbg("file not found") if repo.type == "git": # The liklihood is that this is a merge, not a # missing ID from some data screwup. # We'll just continue and throw this away continue else: printerr("No file ID found for hunk " + \ hunk_file_name + \ " at commit " + commit_id) insert = """insert into hunks(file_id, commit_id, old_start_line, old_end_line, new_start_line, new_end_line) values(?,?,?,?,?,?)"""
def end_file(self):
    profiler_stop("Processing blame output for %s", (self.filename))
    if len(self.bug_revs) == 0:
        printdbg("No bug revision found in this file")