def commit_tree_to_frame(commit):
    """Build a data frame describing the tree of a single commit.

    :param commit: commit object exposing ``tree`` and ``hexsha`` attributes
    :return: frame with one row per dict collected by ``_add_subtree``, a
        ``hexsha`` column holding the commit's hash, and the ``child_type``
        column passed through ``su.categorize``
    """
    rows = []
    # The return value is ignored, so _add_subtree is expected to append its
    # results to ``rows``; the walk starts at the root path '.'.
    _add_subtree(rows, commit.tree, '.')

    frame = pd.DataFrame(rows).assign(hexsha=commit.hexsha)
    frame['child_type'] = su.categorize(frame['child_type'])
    return frame
def commits_to_frame(commits):
    """Convert an iterable of commits into a data frame, one row per commit.

    The attributes listed in ``commit_attrs`` are extracted from every commit
    via ``su.objs_to_dicts``; dotted attribute paths become column names with
    the dots replaced by underscores (``author.name`` -> ``author_name``).

    :param commits: iterable of commit objects providing the attributes
        listed in ``commit_attrs``
    :return: frame sorted by ``committed_datetime`` (newest first) with a
        fresh 0..n-1 index
    """
    commit_attrs = (
        'hexsha', 'name_rev', 'size',
        'author.name', 'author.email',
        'authored_datetime', 'author_tz_offset',
        'committer.name', 'committer.email',
        'committed_datetime', 'committer_tz_offset',
        'encoding', 'message',
        'stats.total.files', 'stats.total.lines',
        'stats.total.insertions', 'stats.total.deletions',
        'stats.files')
    column_names = {attr: attr.replace('.', '_') for attr in commit_attrs}

    commit_frame = su.dicts_to_dataframe(list(
        su.objs_to_dicts(commits, commit_attrs)))
    commit_frame.rename(columns=column_names, inplace=True)

    # Keep only the text after the first space (presumably dropping the
    # leading hexsha -- confirm against the name_rev format). ``n`` must be
    # passed by keyword: pandas 2.x rejects it as a positional argument.
    # ``.str[-1]`` takes the last piece and leaves missing values as NaN.
    commit_frame['name_rev'] = (
        commit_frame['name_rev'].str.split(' ', n=1).str[-1])

    categorical_cols = (
        'name_rev', 'author_name', 'author_email',
        'committer_name', 'committer_email', 'encoding')
    for c in categorical_cols:
        commit_frame[c] = su.categorize(commit_frame[c])

    for c in ('authored_datetime', 'committed_datetime'):
        commit_frame[c] = commit_frame[c].astype('datetime64[ns]')

    # Escape newlines so each message fits on one line. ``regex=False`` pins
    # literal replacement: the default for ``regex`` differs between pandas
    # versions, and in regex mode the replacement '\\n' would be read as an
    # escape sequence rather than as backslash + 'n'.
    commit_frame['message'] = commit_frame['message'].str.replace(
        '\n', '\\n', regex=False)

    commit_frame = commit_frame.sort_values(
        'committed_datetime', ascending=False).reset_index(drop=True)
    return commit_frame
def commit_trees_to_frame(commits):
    """Stack the tree frames of several commits into a single frame.

    :param commits: iterable of commit objects accepted by
        ``commit_tree_to_frame``
    :return: concatenation of the per-commit tree frames with a fresh
        0..n-1 index; the ``hexsha``, ``tree``, ``child`` and ``child_type``
        columns are passed through ``su.categorize``
    """
    per_commit_frames = [commit_tree_to_frame(commit) for commit in commits]
    combined: pd.DataFrame = pd.concat(per_commit_frames)

    # Categorize after concatenating so every column is processed once over
    # the values of all commits.
    for column in ('hexsha', 'tree', 'child', 'child_type'):
        combined[column] = su.categorize(combined[column])

    combined = combined.reset_index(drop=True)
    return combined
def commits_to_actor_frame(commits):
    """Build a frame of the people appearing as author or committer.

    Author and committer frames are obtained from ``extract_actors`` and
    outer-merged on name and email, so a person who shows up in only one
    role still gets a row.

    :param commits: iterable of commits handed to ``extract_actors``
    :return: de-duplicated frame with a fresh index, ``name`` and ``email``
        passed through ``su.categorize``, and integer ``author_commits`` and
        ``committer_commits`` columns
    """
    key_columns = ('name', 'email')
    as_author = extract_actors(commits, 'author', key_columns)
    as_committer = extract_actors(commits, 'committer', key_columns)

    merged = pd.merge(as_author, as_committer, on=key_columns, how='outer')
    unique_rows = merged.drop_duplicates()
    reindexed = unique_rows.reset_index(drop=True)
    # The outer merge leaves NaN where a person is missing from one side;
    # filling with 0 lets the count columns be cast to int below.
    actors = reindexed.fillna(0)

    for key in key_columns:
        actors[key] = su.categorize(actors[key])

    # These column names are expected to come from extract_actors.
    for count_column in ('author_commits', 'committer_commits'):
        actors[count_column] = actors[count_column].astype('int')

    return actors
def connect_actors(actor_frame, connectivity_sets, connectivity_column):
    """Write a shared identity label into a column of the actor frame.

    :param actor_frame: frame to annotate; the column is set on this object
    :param connectivity_sets: mapping from an identity label to the
        collection of entries that share it; the entries become the index of
        the series assigned to the column, so they are expected to be
        ``actor_frame`` index values
    :param connectivity_column: name of the column that receives the labels
    :return: the same ``actor_frame`` object that was passed in

    Examples:
        same_actors = {
            'ccason': [3, 14, 15],
            'clipka': [4, 5, 13],
            'wfpokorny': [11, 17],
            'anshuarya': [0],
            'bentsm': [1],
            'cbarton': [2],
            'dbodor': [6],
            'jlecher': [7],
            'jgrimbert': [8],
            'nalvarez': [9],
            'selvik': [10],
            'wverhelst': [12],
            'gryken': [16],
            'github': [18]}
        actor_frame = connect_actors(actor_frame, same_actors, 'actor_id')
    """
    # Invert the mapping: entry -> label. If an entry is listed under more
    # than one label, the label that comes last in iteration order wins.
    label_by_entry = {
        entry: label
        for label, entries in connectivity_sets.items()
        for entry in entries
    }
    labels = pd.Series(label_by_entry)
    actor_frame[connectivity_column] = su.categorize(labels)
    return actor_frame