def get_commit_editing_paths(sqlite_db_file, time_from=None, time_to=None, filename=None):
    """ Returns DAG of commits where an edge between commit A and B indicates that lines written
        in commit A were changed in commit B. Further outputs editing paths extracted from the DAG.
        Node and edge infos set up to be expanded with future releases.

    Args:
        sqlite_db_file: path to sqlite database
        time_from: start time of time window filter, datetime object
        time_to: end time of time window filter, datetime object
        filename: filter to obtain only commits editing a certain file

    Returns:
        paths: pathpy path object capturing editing paths
        dag: pathpy dag object linking commits
        node_info: info on node characteristics
        edge_info: info on edge characteristics
    """
    con = sqlite3.connect(sqlite_db_file)
    data = pd.read_sql(
        """SELECT edits.original_commit_deletion AS pre_commit,
                  edits.commit_hash AS post_commit,
                  edits.filename,
                  commits.author_date AS time
           FROM edits
           JOIN commits
           ON edits.commit_hash = commits.hash""", con).drop_duplicates()
    con.close()  # all required data is now in memory

    if filename is not None:
        data = data.loc[data.filename == filename, :]

    # Default time window spans the full range of the (string-sorted, ISO-formatted) data.
    if time_from is None:
        time_from = datetime.datetime.strptime(min(data.time), '%Y-%m-%d %H:%M:%S')
    if time_to is None:
        time_to = datetime.datetime.strptime(max(data.time), '%Y-%m-%d %H:%M:%S')

    # Placeholders, set up to be expanded with future releases.
    node_info = {}
    edge_info = {}

    dag = pp.DAG()
    for _, row in data.iterrows():
        # Parse each row's timestamp once (was parsed twice per row before).
        row_time = datetime.datetime.strptime(row.time, '%Y-%m-%d %H:%M:%S')
        if time_from <= row_time <= time_to:
            dag.add_edge(row.pre_commit, row.post_commit)

    dag.topsort()
    assert dag.is_acyclic is True
    paths = pp.path_extraction.paths_from_dag(dag)

    return paths, dag, node_info, edge_info
def dag_object():
    """Build a small two-component test DAG and return it as a pathpy DAG object.

    For this DAG, the following five paths between the root and the leaves exist
    for the following mapping:
        mapping = {'a': 'A', 'b': 'B', 'c': 'A', 'e': 'B',
                   'f': 'B', 'g': 'A', 'h': 'A', 'i': 'B', 'j': 'A'}
        h -> i                ( A -> B )
        h -> j                ( A -> A )
        a -> b -> e           ( A -> B -> B )
        a -> c -> g           ( A -> A -> A )
        a -> b -> f -> g      ( A -> B -> B -> A )
        a -> c -> b -> e      ( A -> A -> B -> B )
        a -> c -> b -> f -> g ( A -> A -> B -> B -> A )
    """
    dag = pp.DAG()
    edges = [('a', 'b'), ('a', 'c'), ('c', 'b'),
             ('b', 'e'), ('b', 'f'), ('f', 'g'),
             ('c', 'g'), ('h', 'i'), ('h', 'j')]
    for source, target in edges:
        dag.add_edge(source, target)
    return dag
def get_line_editing_paths(sqlite_db_file, commit_hashes=None, file_paths=None, with_start=False,
                           merge_renaming=True):
    """ Returns line editing DAG as well as line editing paths.
        Node and edge infos set up to be expanded with future releases.

    Args:
        sqlite_db_file: path to sqlite database mined with git2net line method
        commit_hashes: list of commits to consider, by default all commits are considered
        file_paths: list of files to consider, by default all files are considered
        with_start: bool, determines if node for filename is included as start for all editing
            paths
        merge_renaming: bool, determines if file renaming is considered

    Returns:
        paths: line editing paths, pathpy Path object
        dag: line editing directed acyclic graph, pathpy DAG object
        node_info: info on node characteristics (colors and times per node)
        edge_info: info on edge characteristics (colors and weights per edge)
    """
    # Connect to provided database.
    con = sqlite3.connect(sqlite_db_file)

    # Check if database is valid: it must carry git2net's _metadata table and have been
    # mined line-by-line ('use_blocks=False'), otherwise line-level sources are unavailable.
    try:
        path = con.execute("SELECT repository FROM _metadata").fetchall()[0][0]
        method = con.execute("SELECT method FROM _metadata").fetchall()[0][0]
        if method == 'blocks':
            raise Exception("Invalid database. A database mined with 'use_blocks=False' is " +
                            "required.")
    except sqlite3.OperationalError:
        raise Exception("You either provided no database or a database not created with git2net. " +
                        "Please provide a valid datatabase mined with 'use_blocks=False'.")

    if merge_renaming:
        print('Searching for aliases')
        # Identify files that have been renamed.
        _, aliases = git2net.identify_file_renaming(path)

    dag = pp.DAG()

    node_info = {}
    node_info['colors'] = {}
    node_info['time'] = {}
    # node_info['file_paths'] = {}
    # node_info['edit_distance'] = {}
    edge_info = {}
    edge_info['colors'] = {}
    edge_info['weights'] = {}

    # Extract required data from the provided database.
    print('Querying commits')
    commits = pd.read_sql("""SELECT hash, author_name, author_date FROM commits""", con)
    print('Querying edits')
    edits = pd.DataFrame()
    no_of_edits = pd.read_sql("""SELECT count(*) FROM edits""", con).iloc[0, 0]
    chunksize = 1000
    # Stream the edits table in chunks to bound memory use; tqdm shows chunk progress.
    for edits in tqdm(pd.read_sql("""SELECT levenshtein_dist,
                                            old_path,
                                            new_path,
                                            commit_hash,
                                            original_commit_deletion,
                                            original_commit_addition,
                                            original_line_no_deletion,
                                            original_line_no_addition,
                                            original_file_path_deletion,
                                            original_file_path_addition,
                                            post_starting_line_no,
                                            edit_type
                                     FROM edits""", con, chunksize=chunksize),
                      total = math.ceil(no_of_edits / chunksize)):

        # Filter edits table if only edits from specific commits are considered.
        if commit_hashes is not None:
            edits = edits.loc[[x in commit_hashes for x in edits.commit_hash], :]

        # Rename file paths to latest name if option is selected.
        if merge_renaming:
            # Update their name in the edits table.
            for key, value in aliases.items():
                edits.replace(key, value[0], inplace=True)

        # Filter edits table if specific files are considered. Has to be done after renaming.
        # NOTE(review): file_paths is rebound to a set() further down, so from the second chunk
        # on this filters against the accumulated set of seen paths — confirm this is intended.
        if file_paths is not None:
            edits = edits.loc[[x in file_paths for x in edits.new_path], :]

        # Get author and date of deletions.
        edits = pd.merge(edits, commits, how='left', left_on='original_commit_deletion',
                         right_on='hash').drop(['hash'], axis=1)
        edits.rename(columns = {'author_name':'author_name_deletion',
                                'author_date': 'author_date_deletion'}, inplace = True)

        # Get author and date of additions.
        edits = pd.merge(edits, commits, how='left', left_on='original_commit_addition',
                         right_on='hash').drop(['hash'], axis=1)
        edits.rename(columns = {'author_name':'author_name_addition',
                                'author_date': 'author_date_addition'}, inplace = True)

        # Get current author and date
        edits = pd.merge(edits, commits, how='left', left_on='commit_hash',
                         right_on='hash').drop(['hash'], axis=1)

        # Collects the file-path start nodes added for 'addition' edits below.
        file_paths = set()

        # Sort edits by author date.
        #print('Sorting edits')
        #edits.sort_values('author_date', ascending=True, inplace=True)

        for _, edit in edits.iterrows():
            if edit.edit_type == 'replacement':
                # Node names encode line number, file path and commit hash.
                # Generate name of target node.
                target = 'L' + str(int(edit.post_starting_line_no)) + ' ' + \
                         edit.new_path + ' ' + \
                         edit.commit_hash

                # Source of deletion must exist.
                source_deletion = 'L' + str(int(edit.original_line_no_deletion)) + ' ' + \
                                  edit.original_file_path_deletion + ' ' + \
                                  edit.original_commit_deletion
                dag.add_edge(source_deletion, target)
                edge_info['colors'][(source_deletion, target)] = 'white'
                edge_info['weights'][(source_deletion, target)] = edit.levenshtein_dist
                node_info['time'][target] = edit.author_date
                node_info['time'][source_deletion] = edit.author_date_deletion

                # Check if source of addition exists.
                if edit.original_commit_addition is not None:
                    source_addition = 'L' + str(int(edit.original_line_no_addition)) + ' ' + \
                                      edit.original_file_path_addition + ' ' + \
                                      edit.original_commit_addition
                    dag.add_edge(source_addition, target)
                    edge_info['colors'][(source_addition, target)] = '#FBB13C'  # yellow
                    edge_info['weights'][(source_addition, target)] = edit.levenshtein_dist
                    node_info['time'][target] = edit.author_date
                    node_info['time'][source_addition] = edit.author_date_addition

            elif edit.edit_type == 'deletion':
                # An edit in a file can only change lines in that file, not in the file the line was
                # copied from.
                if edit.original_file_path_deletion == edit.old_path:
                    # Generate name of target node.
                    target = 'deleted L' + str(int(edit.original_line_no_deletion)) + ' ' + \
                             edit.original_file_path_deletion + ' ' + \
                             edit.original_commit_deletion

                    # Source of deletion must exist.
                    source_deletion = 'L' + str(int(edit.original_line_no_deletion)) + ' ' + \
                                      edit.original_file_path_deletion + ' ' + \
                                      edit.original_commit_deletion
                    dag.add_edge(source_deletion, target)
                    edge_info['colors'][(source_deletion, target)] = 'white'
                    edge_info['weights'][(source_deletion, target)] = edit.levenshtein_dist
                    node_info['time'][target] = edit.author_date
                    node_info['time'][source_deletion] = edit.author_date_deletion
                # else:
                #     print(edit)
                #     copied_from = 'L' + str(int(edit.original_line_no_deletion)) + ' ' + \
                #                   edit.original_file_path_deletion + ' ' + \
                #                   edit.original_commit_deletion
                #     copied_to = 'L' + str(int(edit.post_starting_line_no)) + ' ' + \
                #                 edit.new_path + ' ' + \
                #                 edit.commit_hash
                #     #found_copied_to = False
                #     #for copied_to in dag.successors[copied_from]:
                #     #    if copied_to.split(' ')[1] == edit.old_path:
                #     #        found_copied_to = True
                #     #        break
                #     #assert found_copied_to
                #     dag.add_edge(copied_to, 'deleted ' + copied_to)
                #     edge_info['colors'][(copied_to, 'deleted ' + copied_to)] = 'white'
                #     edge_info['weights'][(copied_to, 'deleted ' + copied_to)] = edit.levenshtein_dist

            elif edit.edit_type == 'addition':
                # Generate name of target node.
                target = 'L' + str(int(edit.post_starting_line_no)) + ' ' + \
                         edit.new_path + ' ' + \
                         edit.commit_hash

                # Add file path as source and add file path to file_paths list.
                source = edit.new_path
                file_paths.add(source)
                dag.add_edge(source, target)
                edge_info['colors'][(source, target)] = 'gray'
                edge_info['weights'][(source, target)] = edit.levenshtein_dist
                node_info['time'][target] = edit.author_date

                # Check if source of addition exists.
                if edit.original_commit_addition is not None:
                    source_addition = 'L' + str(int(edit.original_line_no_addition)) + ' ' + \
                                      edit.original_file_path_addition + ' ' + \
                                      edit.original_commit_addition
                    dag.add_edge(source_addition, target)
                    edge_info['colors'][(source_addition, target)] = '#FBB13C'
                    edge_info['weights'][(source_addition, target)] = edit.levenshtein_dist
                    node_info['time'][target] = edit.author_date
                    node_info['time'][source_addition] = edit.author_date_addition

            elif edit.edit_type == 'file_renaming':
                # Renamings carry no line-level information; aliases are handled above.
                pass
            else:
                raise Exception("Unexpected error in 'extract_editing_paths'.")

    # Color nodes by their role: file starts, lines edited later, deleted lines,
    # still-live lines, lines straight from a file start, and everything else.
    for node in tqdm(dag.nodes):
        if node in file_paths:
            node_info['colors'][node] = 'gray'
        else:
            if '#FBB13C' in [edge_info['colors'][n]
                             for n in [(x, node) for x in dag.predecessors[node]]]:
                node_info['colors'][node] = '#FBB13C'  # yellow
            elif node.startswith('deleted'):
                node_info['colors'][node] = '#A8322D'  # red
            elif 'white' not in [edge_info['colors'][n]
                                 for n in [(node, x) for x in dag.successors[node]]]:
                node_info['colors'][node] = '#2E5EAA'  # blue
            elif not dag.predecessors[node].isdisjoint(file_paths):
                node_info['colors'][node] = '#218380'  # green
            else:
                node_info['colors'][node] = '#73D2DE'  # light blue

    # Drop the artificial file-path start nodes unless explicitly requested.
    if not with_start:
        for file_path in file_paths:
            dag.remove_node(file_path)

    dag.topsort()
    assert dag.is_acyclic is True
    paths = pp.path_extraction.paths_from_dag(dag)

    return paths, dag, node_info, edge_info
def get_commit_editing_dag(sqlite_db_file, time_from=None, time_to=None, filename=None):
    """ Returns DAG of commits where an edge between commit A and B indicates that lines written
        in commit A were changed in commit B. Further outputs editing paths extracted from the DAG.

    :param str sqlite_db_file: path to SQLite database
    :param datetime.datetime time_from: start time of time window filter, datetime object
    :param datetime.datetime time_to: end time of time window filter, datetime object
    :param str filename: filter to obtain only commits editing a certain file

    :return:
        - *pathpy.DAG* – commit editing dag
        - *dict* – info on node characteristics
        - *dict* – info on edge characteristics
    """
    con = sqlite3.connect(sqlite_db_file)
    data = pd.read_sql(
        """SELECT edits.original_commit_deletion AS pre_commit,
                  edits.commit_hash AS post_commit,
                  edits.filename,
                  commits.author_date AS time,
                  commits.author_timezone as timezone
           FROM edits
           JOIN commits
           ON edits.commit_hash = commits.hash""", con).drop_duplicates()
    con.close()  # all required data is now in memory

    if filename is not None:
        data = data.loc[data.filename == filename, :]

    # Convert author dates to UTC epoch seconds; author_timezone is the UTC offset in seconds.
    # astype('int64') yields nanoseconds since epoch (Series.view is deprecated in pandas 2.x).
    data['time'] = [
        int(t / (10**9) - tz) for t, tz in zip(
            pd.to_datetime(data.time, format='%Y-%m-%d %H:%M:%S').astype('int64'),
            data.timezone)
    ]
    data = data.drop(['timezone'], axis=1)

    # Default time window spans the full data range; explicit bounds are taken as UTC.
    if time_from is None:
        time_from = min(data.time)
    else:
        time_from = int(calendar.timegm(time_from.timetuple()))
    if time_to is None:
        time_to = max(data.time)
    else:
        time_to = int(calendar.timegm(time_to.timetuple()))

    # Placeholders, set up to be expanded with future releases.
    node_info = {}
    edge_info = {}

    dag = pp.DAG()
    for _, row in data.iterrows():
        if time_from <= row.time <= time_to:
            dag.add_edge(row.pre_commit, row.post_commit)

    dag.topsort()
    assert dag.is_acyclic is True

    return dag, node_info, edge_info
def test_dag_init(edge_list):
    """Construct a DAG directly from an edge list and print its summary."""
    print(pp.DAG(edges=edge_list))