Example #1
0
def get_commit_editing_paths(sqlite_db_file,
                             time_from=None,
                             time_to=None,
                             filename=None):
    """ Returns DAG of commits where an edge between commit A and B indicates that lines written in
        commit A were changed in commit B. Further outputs editing paths extracted from the DAG.

        Node and edge infos set up to be expanded with future releases.

    Args:
        sqlite_db_file: path to sqlite database
        time_from: start time of time window filter, datetime object
        time_to: end time of time window filter, datetime object
        filename: filter to obtain only commits editing a certain file

    Returns:
        paths: pathpy path object capturing editing paths
        dag: pathpy dag object linking commits
        node_info: info on node characteristics
        edge_info: info on edge characteristics
    """

    con = sqlite3.connect(sqlite_db_file)
    # Join edits with commits to get, per edit, the deleted-from commit, the
    # editing commit, and the editing commit's author date.
    data = pd.read_sql(
        """SELECT edits.original_commit_deletion AS pre_commit,
                                 edits.commit_hash AS post_commit,
                                 edits.filename,
                                 commits.author_date AS time
                          FROM edits
                          JOIN commits
                          ON edits.commit_hash = commits.hash""",
        con).drop_duplicates()
    if filename is not None:
        data = data.loc[data.filename == filename, :]

    # Default the time window to the full range present in the data.
    # Note: use 'is None' (identity) for None checks, not '== None'.
    if time_from is None:
        time_from = datetime.datetime.strptime(min(data.time),
                                               '%Y-%m-%d %H:%M:%S')
    if time_to is None:
        time_to = datetime.datetime.strptime(max(data.time),
                                             '%Y-%m-%d %H:%M:%S')

    node_info = {}
    edge_info = {}

    dag = pp.DAG()
    for _, row in data.iterrows():
        # Parse each row's author date once (the original parsed it twice,
        # once per bound check).
        commit_time = datetime.datetime.strptime(row.time, '%Y-%m-%d %H:%M:%S')
        if time_from <= commit_time <= time_to:
            dag.add_edge(row.pre_commit, row.post_commit)

    dag.topsort()

    assert dag.is_acyclic is True

    paths = pp.path_extraction.paths_from_dag(dag)

    return paths, dag, node_info, edge_info
Example #2
0
File: conftest.py  Project: uzhdag/pathpy
def dag_object():
    """Build a small fixture DAG with two weakly connected components.

    Between the roots and the leaves, the following paths exist for the
    mapping:
        mapping = {'a': 'A', 'b': 'B', 'c': 'A', 'e': 'B',
                   'f': 'B', 'g': 'A', 'h': 'A', 'i': 'B', 'j': 'A'}

        h -> i                  ( A -> B )
        h -> j                  ( A -> A )
        a -> b -> e             ( A -> B -> B )
        a -> c -> g             ( A -> A -> A )
        a -> b -> f -> g        ( A -> B -> B -> A )
        a -> c -> b -> e        ( A -> A -> B -> B )
        a -> c -> b -> f -> g   ( A -> A -> B -> B -> A )
    """
    fixture = pp.DAG()
    edges = [('a', 'b'), ('a', 'c'), ('c', 'b'),
             ('b', 'e'), ('b', 'f'), ('f', 'g'),
             ('c', 'g'), ('h', 'i'), ('h', 'j')]
    for source, target in edges:
        fixture.add_edge(source, target)
    return fixture
Example #3
0
def get_line_editing_paths(sqlite_db_file, commit_hashes=None, file_paths=None, with_start=False,
                           merge_renaming=True):
    """ Returns line editing DAG as well as line editing paths.

        Builds a node per line version ('L<no> <path> <commit>'), links line
        versions along edit chains, then extracts paths from the DAG.

        Node and edge infos set up to be expanded with future releases.

    Args:
        sqlite_db_file: path to sqlite database mined with git2net line method
        commit_hashes: list of commits to consider, by default all commits are considered
        file_paths: list of files to consider, by default all files are considered
        with_start: bool, determines if node for filename is included as start for all editing paths
        merge_renaming: bool, determines if file renaming is considered

    Returns:
        paths: line editing paths, pathpy Path object
        dag: line editing directed acyclic graph, pathpy DAG object
        node_info: info on node characteristics (colors and times)
        edge_info: info on edge characteristics (colors and weights)
    """

    # Connect to provided database.
    con = sqlite3.connect(sqlite_db_file)

    # Check if database is valid: it must carry git2net's _metadata table and
    # must have been mined line-by-line (method != 'blocks').
    try:
        path = con.execute("SELECT repository FROM _metadata").fetchall()[0][0]
        method = con.execute("SELECT method FROM _metadata").fetchall()[0][0]
        if method == 'blocks':
            raise Exception("Invalid database. A database mined with 'use_blocks=False' is " +
                            "required.")
    except sqlite3.OperationalError:
        raise Exception("You either provided no database or a database not created with git2net. " +
                        "Please provide a valid datatabase mined with 'use_blocks=False'.")
    if merge_renaming:
        print('Searching for aliases')
        # Identify files that have been renamed.
        # NOTE(review): 'path' here is the repository string from _metadata;
        # presumably identify_file_renaming expects that — confirm.
        _, aliases = git2net.identify_file_renaming(path)

    dag = pp.DAG()
    node_info = {}
    node_info['colors'] = {}
    node_info['time'] = {}
    # node_info['file_paths'] = {}
    # node_info['edit_distance'] = {}
    edge_info = {}
    edge_info['colors'] = {}
    edge_info['weights'] = {}

    # Extract required data from the provided database.
    print('Querying commits')
    commits = pd.read_sql("""SELECT hash, author_name, author_date FROM commits""", con)
    print('Querying edits')
    # NOTE(review): this empty DataFrame is dead — the for-loop below rebinds
    # 'edits' to each chunk returned by read_sql.
    edits = pd.DataFrame()
    # Total edit count is only used to size the tqdm progress bar.
    no_of_edits = pd.read_sql("""SELECT count(*) FROM edits""", con).iloc[0, 0]
    chunksize = 1000
    # Stream the edits table in chunks of 'chunksize' rows; each iteration
    # processes one chunk and accumulates edges into 'dag'.
    for edits in tqdm(pd.read_sql("""SELECT levenshtein_dist,
                                            old_path,
                                            new_path,
                                            commit_hash,
                                            original_commit_deletion,
                                            original_commit_addition,
                                            original_line_no_deletion,
                                            original_line_no_addition,
                                            original_file_path_deletion,
                                            original_file_path_addition,
                                            post_starting_line_no,
                                            edit_type
                                      FROM edits""", con, chunksize=chunksize),
                            total = math.ceil(no_of_edits / chunksize)):



        # Filter edits table if only edits from specific commits are considered.
        if commit_hashes is not None:
            edits = edits.loc[[x in commit_hashes for x in edits.commit_hash], :]

        # Rename file paths to latest name if option is selected.
        if merge_renaming:
            # Update their name in the edits table.
            for key, value in aliases.items():
                edits.replace(key, value[0], inplace=True)

        # Filter edits table if specific files are considered. Has to be done after renaming.
        if file_paths is not None:
            edits = edits.loc[[x in file_paths for x in edits.new_path], :]

        # Get author and date of deletions.
        edits = pd.merge(edits, commits, how='left', left_on='original_commit_deletion',
                                right_on='hash').drop(['hash'], axis=1)
        edits.rename(columns = {'author_name':'author_name_deletion',
                                'author_date': 'author_date_deletion'}, inplace = True)

        # Get author and date of additions.
        edits = pd.merge(edits, commits, how='left', left_on='original_commit_addition',
                                right_on='hash').drop(['hash'], axis=1)
        edits.rename(columns = {'author_name':'author_name_addition',
                                'author_date': 'author_date_addition'}, inplace = True)

        # Get current author and date
        edits = pd.merge(edits, commits, how='left', left_on='commit_hash',
                                right_on='hash').drop(['hash'], axis=1)

        # NOTE(review): this rebinds the 'file_paths' PARAMETER to an empty
        # set on every chunk. Consequences: (1) the file filter above only
        # applies to the first chunk; (2) only the last chunk's addition
        # sources remain in 'file_paths' for the coloring/removal steps after
        # the loop. Looks like a bug — should probably be initialized once
        # before the loop under a different name.
        file_paths = set()

        # Sort edits by author date.
        #print('Sorting edits')
        #edits.sort_values('author_date', ascending=True, inplace=True)

        for _, edit in edits.iterrows():
            if edit.edit_type == 'replacement':
                # Generate name of target node.
                target = 'L' + str(int(edit.post_starting_line_no)) + ' ' + \
                        edit.new_path + ' ' + \
                        edit.commit_hash

                # Source of deletion must exist.
                source_deletion = 'L' + str(int(edit.original_line_no_deletion)) + ' ' + \
                                edit.original_file_path_deletion + ' ' + \
                                edit.original_commit_deletion
                dag.add_edge(source_deletion, target)
                edge_info['colors'][(source_deletion, target)] = 'white'
                edge_info['weights'][(source_deletion, target)] = edit.levenshtein_dist
                node_info['time'][target] = edit.author_date
                node_info['time'][source_deletion] = edit.author_date_deletion
                # Check if source of addition exists.
                if edit.original_commit_addition is not None:
                    source_addition = 'L' + str(int(edit.original_line_no_addition)) + ' ' + \
                                    edit.original_file_path_addition + ' ' + \
                                    edit.original_commit_addition
                    dag.add_edge(source_addition, target)
                    edge_info['colors'][(source_addition, target)] = '#FBB13C' # yellow
                    edge_info['weights'][(source_addition, target)] = edit.levenshtein_dist
                    node_info['time'][target] = edit.author_date
                    node_info['time'][source_addition] = edit.author_date_addition
            elif edit.edit_type == 'deletion':
                # An edit in a file can only change lines in that file, not in the file the line was
                # copied from.
                if edit.original_file_path_deletion == edit.old_path:
                    # Generate name of target node.
                    target = 'deleted L' + str(int(edit.original_line_no_deletion)) + ' ' + \
                            edit.original_file_path_deletion + ' ' + \
                            edit.original_commit_deletion

                    # Source of deletion must exist.
                    source_deletion = 'L' + str(int(edit.original_line_no_deletion)) + ' ' + \
                                    edit.original_file_path_deletion + ' ' + \
                                    edit.original_commit_deletion
                    dag.add_edge(source_deletion, target)
                    edge_info['colors'][(source_deletion, target)] = 'white'
                    edge_info['weights'][(source_deletion, target)] = edit.levenshtein_dist
                    node_info['time'][target] = edit.author_date
                    node_info['time'][source_deletion] = edit.author_date_deletion
                # else:
                #     print(edit)
                #     copied_from = 'L' + str(int(edit.original_line_no_deletion)) + ' ' + \
                #                     edit.original_file_path_deletion + ' ' + \
                #                     edit.original_commit_deletion

                #     copied_to = 'L' + str(int(edit.post_starting_line_no)) + ' ' + \
                #         edit.new_path + ' ' + \
                #         edit.commit_hash

                #     #found_copied_to = False
                #     #for copied_to in dag.successors[copied_from]:
                #     #    if copied_to.split(' ')[1] == edit.old_path:
                #     #        found_copied_to = True
                #     #        break
                #     #assert found_copied_to
                #     dag.add_edge(copied_to, 'deleted ' + copied_to)
                #     edge_info['colors'][(copied_to, 'deleted ' + copied_to)] = 'white'
                #     edge_info['weights'][(copied_to, 'deleted ' + copied_to)] = edit.levenshtein_dist
            elif edit.edit_type == 'addition':
                # Generate name of target node.
                target = 'L' + str(int(edit.post_starting_line_no)) + ' ' + \
                        edit.new_path + ' ' + \
                        edit.commit_hash

                # Add file path as source and add file path to file_paths list.
                source = edit.new_path
                file_paths.add(source)
                dag.add_edge(source, target)
                edge_info['colors'][(source, target)] = 'gray'
                edge_info['weights'][(source, target)] = edit.levenshtein_dist
                node_info['time'][target] = edit.author_date

                # Check if source of addition exists.
                if edit.original_commit_addition is not None:
                    source_addition = 'L' + str(int(edit.original_line_no_addition)) + ' ' + \
                                    edit.original_file_path_addition + ' ' + \
                                    edit.original_commit_addition
                    dag.add_edge(source_addition, target)
                    edge_info['colors'][(source_addition, target)] = '#FBB13C'
                    edge_info['weights'][(source_addition, target)] = edit.levenshtein_dist
                    node_info['time'][target] = edit.author_date
                    node_info['time'][source_addition] = edit.author_date_addition
            elif edit.edit_type == 'file_renaming':
                pass
            else:
                raise Exception("Unexpected error in 'extract_editing_paths'.")

    # Assign a display color to every node based on its incoming/outgoing
    # edge colors: yellow = target of a copy, red = deleted, blue = never
    # subsequently edited, green = directly added to a file, light blue =
    # everything else.
    for node in tqdm(dag.nodes):
        if node in file_paths:
            node_info['colors'][node] = 'gray'
        else:
            if '#FBB13C' in [edge_info['colors'][n] for n in [(x, node)
                                                    for x in dag.predecessors[node]]]:
                node_info['colors'][node] = '#FBB13C' # yellow
            elif node.startswith('deleted'):
                node_info['colors'][node] = '#A8322D' # red
            elif 'white' not in [edge_info['colors'][n] for n in [(node, x)
                                                        for x in dag.successors[node]]]:
                node_info['colors'][node] = '#2E5EAA' # blue
            elif not dag.predecessors[node].isdisjoint(file_paths):
                node_info['colors'][node] = '#218380' # green
            else:
                node_info['colors'][node] = '#73D2DE' # light blue

    # Drop the synthetic file-name start nodes unless the caller wants them.
    if not with_start:
        for file_path in file_paths:
            dag.remove_node(file_path)

    dag.topsort()

    assert dag.is_acyclic is True

    paths = pp.path_extraction.paths_from_dag(dag)

    return paths, dag, node_info, edge_info
Example #4
0
def get_commit_editing_dag(sqlite_db_file,
                           time_from=None,
                           time_to=None,
                           filename=None):
    """
    Returns DAG of commits where an edge between commit A and B indicates that lines written in
    commit A were changed in commit B. Further outputs editing paths extracted from the DAG.

    :param str sqlite_db_file: path to SQLite database
    :param datetime.datetime time_from: start time of time window filter, datetime object
    :param datetime.datetime time_to: end time of time window filter, datetime object
    :param str filename: filter to obtain only commits editing a certain file

    :return:
        - *pathpy.DAG* – commit editing dag
        - *dict* – info on node characteristics
        - *dict* – info on edge characteristics
    """

    con = sqlite3.connect(sqlite_db_file)
    data = pd.read_sql(
        """SELECT edits.original_commit_deletion AS pre_commit,
                                 edits.commit_hash AS post_commit,
                                 edits.filename,
                                 commits.author_date AS time,
                                 commits.author_timezone as timezone
                          FROM edits
                          JOIN commits
                          ON edits.commit_hash = commits.hash""",
        con).drop_duplicates()

    if filename is not None:
        data = data.loc[data.filename == filename, :]

    # Convert author dates to epoch seconds shifted by the stored timezone
    # offset. Integer floor-division ('// 10**9') avoids the float-precision
    # loss of 'int(t / 10**9)' on nanosecond timestamps (which exceed 2**53),
    # and .astype('int64') replaces the deprecated Series.view('int64').
    data['time'] = (
        pd.to_datetime(data.time, format='%Y-%m-%d %H:%M:%S').astype('int64')
        // 10**9 - data.timezone)

    data = data.drop(['timezone'], axis=1)

    # Default the window to the observed range; explicit datetime bounds are
    # converted to epoch seconds (UTC). Use 'is None' for None checks.
    if time_from is None:
        time_from = min(data.time)
    else:
        time_from = int(calendar.timegm(time_from.timetuple()))
    if time_to is None:
        time_to = max(data.time)
    else:
        time_to = int(calendar.timegm(time_to.timetuple()))

    node_info = {}
    edge_info = {}

    dag = pp.DAG()
    for _, row in data.iterrows():
        if time_from <= row.time <= time_to:
            dag.add_edge(row.pre_commit, row.post_commit)

    dag.topsort()

    assert dag.is_acyclic is True

    return dag, node_info, edge_info
Example #5
0
def test_dag_init(edge_list):
    """Smoke test: a DAG can be constructed directly from an edge list."""
    constructed = pp.DAG(edges=edge_list)
    print(constructed)