def transform_data(input):
    """Expand benchmark results into one row per timing, tagged with commit data.

    Each entry of ``input["results"]`` whose ``command`` starts with an
    (optionally directory-prefixed) hexadecimal commit hash is looked up among
    the commits reachable from the ``master`` branch. For every value in the
    entry's ``times`` list a copy of the entry is emitted with ``times``
    removed and ``commit``, ``message`` and ``time`` added.

    Entries are skipped when the command ends with ``[ERROR]``, when no commit
    hash can be parsed from it, or when the hash is not found on ``master``;
    the last two cases are reported on stderr.

    Returns:
        list of dict rows, one per recorded timing.
    """
    repo = Repository(get_git_dir())
    commits = {
        commit.id.hex: commit
        for commit in repo.walk(repo.branches.get("master").target)
    }

    output = []
    for row in input["results"]:
        command = row["command"]
        if command.endswith("[ERROR]"):
            continue

        match = re.match("^([^ ]+/)?([0-9a-f]+)", command)
        if match is None:
            # Without this guard a non-matching command raised AttributeError
            # on ``None.groups()`` and aborted the whole transformation.
            print(f"Skipping unparsable command {command!r}", file=sys.stderr)
            continue
        commit_hash = match.group(2)

        try:
            commit = commits[commit_hash]
        except KeyError:
            print(f"Skipping commit {commit_hash}", file=sys.stderr)
            continue

        for measured in row["times"]:
            output_row = row.copy()
            output_row["commit"] = f"{commit.commit_time}-{commit_hash}"
            output_row["message"] = commit.message
            del output_row["times"]
            output_row["time"] = measured
            output.append(output_row)
    return output
class GitAccessor(ScmAccessor):
    """SCM accessor that reads commit history from a git repository."""

    def __init__(self, repo_path, start_rev=None):
        super().__init__(repo_path=repo_path, start_rev=start_rev)
        self._scm = Repository(path.join(repo_path))

    def get_log(self):
        """Return ``LogEntry`` objects for the commits reachable from HEAD.

        Commits are visited in ``GIT_SORT_TIME`` order. The walk stops after
        the commit whose hex id equals ``self._start_rev`` (that commit is
        included). Each entry's ``diff`` holds the left-hand column of the
        full diff-stat output against the first parent, without the trailing
        summary line; commits without parents get an empty list.
        """
        entries = []
        for commit in self._scm.walk(self._scm.head.target, GIT_SORT_TIME):
            print("Processing commit %s" % commit.id)

            if commit.parents:
                stats_text = self._scm.diff(commit, commit.parents[0]).stats.format(
                    GIT_DIFF_STATS_FULL, 1)
            else:
                stats_text = ""
            # The last line of the stat output is the summary, not a file.
            stat_lines = stats_text.splitlines()[:-1]

            entry = LogEntry()
            entry.id = commit.id
            entry.msg = commit.message.strip("\n")
            entry.author = commit.committer.name
            entry.email = commit.committer.email
            entry.time = datetime.fromtimestamp(commit.commit_time)
            entry.diff = [line.split("|")[0].strip() for line in stat_lines]
            entries.append(entry)

            if self._start_rev and commit.id.hex == self._start_rev:
                break
        return entries
def try_commit_and_push(name, version, bundle_version):
    """Commit the staged state of the ``headers`` repository and push it.

    The commit message is ``"<name> <version> (<bundle_version>)"``. If any
    commit reachable from HEAD already carries exactly that message, nothing
    is committed.

    Returns:
        False when a commit with the same message already exists, True after
        a new commit was created on ``refs/heads/master`` and pushed.
    """
    repo = Repository("headers/.git")
    new_commit_message = " ".join([name, version, "(" + bundle_version + ")"])

    # Was this version committed before?
    history = repo.walk(repo.head.target, GIT_SORT_TIME | GIT_SORT_REVERSE)
    if any(commit.message == new_commit_message for commit in history):
        return False

    # Stage everything and persist the index before writing the tree.
    index = repo.index
    index.add_all()
    index.write()

    print("Commiting...")
    signature = repo.default_signature
    tree = index.write_tree()
    ref = "refs/heads/master"
    parent = repo.head.get_object().hex
    repo.create_commit(ref, signature, signature, new_commit_message, tree,
                       [parent])
    push(repo, ref)
    return True
def get_bug_commit_ratio_per_file(git_folder=".git/", output_file=None):
    """Write a CSV with the share of bug-related commits for every touched file.

    Walks all commits reachable from HEAD, counts per file how many commits
    touched it and how many of those were flagged as bug related, and writes
    one row per file (``file_name``, ``commit_num``, ``bug_commit_num``,
    ``bug_commit_ratio``) to ``output_file``.

    Args:
        git_folder: name of the git directory inside ``git_repo``.
        output_file: path of the CSV file to write. Required; it only has a
            default because a parameter without one may not follow
            ``git_folder`` (the original signature was a SyntaxError).

    Raises:
        TypeError: if ``output_file`` is not supplied.

    ``git_repo``, ``bug_related`` and ``csv_header`` are not defined in this
    function and are resolved as module-level names.
    NOTE(review): ``bug_related`` is never computed per commit here - confirm
    where it is supposed to come from.
    """
    if output_file is None:
        raise TypeError(
            "get_bug_commit_ratio_per_file() missing required argument: "
            "'output_file'")

    # Keyed by file name so each update is a dict lookup instead of a scan of
    # all rows; insertion order keeps rows in first-seen order.
    stats_by_file = {}
    exec_dir = os.getcwd()
    repo = Repository(os.path.join(git_repo, git_folder))
    os.chdir(git_repo)
    try:
        for commit in repo.walk(repo.head.target):
            for file_name in get_touched_files(commit):
                entry = stats_by_file.get(file_name)
                if entry is None:
                    entry = {'file_name': file_name,
                             'commit_num': 0,
                             'bug_commit_num': 0}
                    stats_by_file[file_name] = entry
                entry['commit_num'] += 1
                if bug_related:
                    entry['bug_commit_num'] += 1
    finally:
        # Restore the working directory even if the walk fails.
        os.chdir(exec_dir)

    result = list(stats_by_file.values())
    for entry in result:
        entry['bug_commit_ratio'] = entry['bug_commit_num'] / entry['commit_num']

    with open(output_file, "w", newline='') as output:
        writer = csv.DictWriter(output, csv_header)
        writer.writeheader()
        writer.writerows(result)
def git_is_clean(srcdir, project):
    """Check whether a project's checkout has nothing that could be lost.

    Returns:
        A ``(clean, reason)`` tuple. ``clean`` is True with an empty reason
        only if the work tree has no changes, HEAD is attached, an upstream
        remote with branches (including the project's master branch) exists
        and every direct local reference points at a commit reachable from
        some branch of that remote. Otherwise ``reason`` names the first
        failed check.
    """
    repo = Repository(os.path.join(srcdir, project.workspace_path, ".git"))

    harmless = (GIT_STATUS_IGNORED, GIT_STATUS_CURRENT)
    if any(flags not in harmless for _, flags in iteritems(repo.status())):
        return False, "has uncommitted changes"
    if repo.head_is_detached:
        return False, "has detached HEAD"

    origin = get_origin(repo, project)
    if not origin:
        return False, "has no upstream remote"

    # Split direct references into targets of the upstream remote and local
    # refs (anything that is not below refs/remotes/), keyed by peeled id.
    upstream_prefix = "refs/remotes/%s/" % origin.name
    remote_refs = []
    local_refs = {}
    for refname in repo.listall_references():
        is_upstream = refname.startswith(upstream_prefix)
        if not is_upstream and refname.startswith("refs/remotes/"):
            continue
        ref = repo.lookup_reference(refname)
        if ref.type != GIT_REF_OID:
            continue
        if is_upstream:
            remote_refs.append(ref.target)
        else:
            local_refs[ref.peel().id] = refname

    if not remote_refs:
        return False, "has no upstream remote branches"
    if not local_refs:
        return False, "has no local branches"
    upstream_master = "%s/%s" % (origin.name, project.master_branch)
    if not repo.lookup_branch(upstream_master, GIT_BRANCH_REMOTE):
        return False, "has no upstream master branch"

    # Every local ref that appears in upstream history is accounted for.
    for remote_ref in remote_refs:
        for commit in repo.walk(remote_ref):
            local_refs.pop(commit.id, None)

    if local_refs:
        names = ["'%s'" % name for _, name in iteritems(local_refs)]
        return False, "has local commits: %s" % ", ".join(names)
    return True, ""
def extract_commits(repos_root, output_path):
    """Dump the commits of every repository below ``repos_root`` as XML.

    For each immediate subdirectory of ``repos_root`` all commits reachable
    from HEAD are converted with ``commit_to_xml`` and collected under one
    ``<commits>`` root, which is written to
    ``<output_path>/<repo name>_<output_commit_file>``.

    Args:
        repos_root: directory whose immediate subdirectories are git clones.
        output_path: directory for the XML files; created if missing.
    """
    os.makedirs(output_path, exist_ok=True)
    exec_dir = os.getcwd()

    for git_repo in get_immediate_subdirectories(repos_root):
        repo_name = os.path.basename(os.path.normpath(git_repo))
        root = etree.Element("commits")

        os.chdir(git_repo)
        try:
            repo = Repository(os.path.join(git_repo, git_folder))
            print("\n> project: " + repo_name + " extraction started")
            for commit in repo.walk(repo.head.target):
                stats = get_commit_stats(commit.id)
                root.append(commit_to_xml(commit, stats))
                print("> project: " + repo_name + ", commit " + str(commit.id)
                      + " processed")
            output_xml = xml_to_string(root)
        finally:
            # Always return to the starting directory, also when a repository
            # fails, so later relative paths are not resolved inside it.
            os.chdir(exec_dir)

        target = os.path.join(output_path, repo_name + "_" + output_commit_file)
        with open(target, "w") as xml_file:
            xml_file.write(output_xml)
        print("\n> project: " + repo_name + " extraction finished")
def _plot_chord_diagram_for_raw_bugs(
        project_name: str, project_repo: pygit2.Repository,
        bug_set: tp.FrozenSet[PygitBug],
        szz_tool: str) -> gob.FigureWidget:
    """Build a chord diagram of introducing/fixing commit relations.

    Every commit reachable from HEAD becomes a node; the bugs in ``bug_set``
    contribute the connecting lines.

    Raises:
        PlotDataEmpty: if the repository has fewer than two commits.
    """
    # commit -> node id
    commit_to_node: tp.Dict[pygit2.Commit,
                            int] = _map_commits_to_nodes(project_repo)
    edge_colors = ['#d4daff', '#84a9dd', '#5588c8', '#6d8acf']

    # Every commit starts as a default node; line generation refines this.
    commit_type: tp.Dict[pygit2.Commit, NodeType] = {
        commit: NodeType.DEFAULT
        for commit in project_repo.walk(project_repo.head.target.id,
                                        pygit2.GIT_SORT_TIME)
    }

    node_total = len(commit_to_node)
    if node_total < 2:
        # With fewer than two commits there is nothing to connect.
        raise PlotDataEmpty

    coordinates = _compute_node_placement(node_total)
    lines = _generate_line_data(bug_set, coordinates, commit_to_node,
                                commit_type, edge_colors)
    nodes = _generate_node_data(project_repo, coordinates, commit_to_node,
                                commit_type)
    return gob.FigureWidget(
        data=nodes + lines,
        layout=_create_layout(f'{szz_tool} {project_name}'))
def classify_by_date(
    path: str,
    start: Optional[str] = None,
    end: Optional[str] = None,
    model: Optional[MLModel] = None
) -> List[str]:
    """Classify the commit messages whose commit time lies between two dates.

    ``start`` and ``end`` are ``YYYY-MM-DD`` strings interpreted in local
    time; both bounds are exclusive. A missing bound leaves that side open.

    Raises:
        RepositoryNotFoundException: if ``<path>/.git`` does not exist.
    """

    def _to_epoch(day: str) -> int:
        parsed = datetime.datetime.strptime(day, "%Y-%m-%d")
        return int(time.mktime(parsed.timetuple()))

    start_time = 0 if start is None else _to_epoch(start)
    end_time = sys.maxsize if end is None else _to_epoch(end)

    repo_path = os.path.join(path, ".git")
    if not os.path.exists(repo_path):
        raise RepositoryNotFoundException
    repo = Repository(repo_path)

    orig_messages = [
        commit.message.lower()
        for commit in repo.walk(repo.head.target, GIT_SORT_TOPOLOGICAL)
        if start_time < commit.commit_time < end_time
    ]
    return classify_messages(orig_messages, model)
def getCommitList(obj, startJd=None, endJd=None, branch="") -> List[Tuple[int, str]]:
    """
    Return ``(epoch, commit_id)`` tuples for a branch, oldest first.

    The walk starts at the newest commit, skips commits authored after
    ``endJd`` and stops at the first commit authored before ``startJd``, so
    it is cheapest when ``endJd`` is None or recent. An empty ``branch``
    means ``"main"``.
    """
    branch = branch or "main"
    startEpoch = getEpochFromJd(startJd) if startJd is not None else None
    endEpoch = getEpochFromJd(endJd) if endJd is not None else None

    repo = Repository(obj.vcsDir)
    target = repo.branches[branch].target

    # Collected newest first: items are (author epoch, commit hash).
    newestFirst = []  # type: List[Tuple[int, str]]
    for commit in repo.walk(target, GIT_SORT_TIME):
        authored = commit.author.time
        if endEpoch is not None and authored > endEpoch:
            continue
        if startEpoch is not None and authored < startEpoch:
            break
        newestFirst.append((authored, commit.id.hex))
    return newestFirst[::-1]
def _bug_data_diff_plot(
        project_name: str, project_repo: pygit2.Repository,
        bugs_left: tp.FrozenSet[PygitBug],
        bugs_right: tp.FrozenSet[PygitBug]) -> gob.Figure:
    """Build a chord diagram showing how two bug sets differ.

    Nodes are the commits reachable from HEAD; lines are the
    introducing/fixing relations of the diff between ``bugs_left`` and
    ``bugs_right``.
    """
    node_ids = _map_commits_to_nodes(project_repo)
    coordinates = _compute_node_placement(len(node_ids))

    # Start with "occurs in neither set" for every commit; the line
    # generation receives this dict together with the diffed bugs.
    occurrences: tp.Dict[pygit2.Commit, DiffOccurrence] = {
        commit: DiffOccurrence.NONE
        for commit in project_repo.walk(project_repo.head.target.hex,
                                        pygit2.GIT_SORT_TIME)
    }

    lines: tp.List[gob.Scatter] = _generate_diff_line_data(
        _diff_raw_bugs(bugs_left, bugs_right), node_ids, coordinates,
        occurrences)

    node_types = {}
    for commit, occurrence in occurrences.items():
        node_types[commit] = __DIFF_TO_NODE_TYPE[occurrence]
    nodes: tp.List[gob.Scatter] = _generate_node_data(
        project_repo, coordinates, node_ids, node_types)

    return gob.Figure(data=lines + nodes,
                      layout=_create_layout(f'szz_diff {project_name}'))
def _generate_node_data(
        project_repo: pygit2.Repository,
        commit_coordinates: tp.List[npt.NDArray[np.float64]],
        map_commit_to_id: tp.Dict[pygit2.Commit, int],
        commit_type: tp.Dict[pygit2.Commit, NodeType]) -> tp.List[gob.Scatter]:
    """Create one scatter node per commit reachable from HEAD.

    Args:
        project_repo: repository whose history is drawn.
        commit_coordinates: node positions, indexed by node id.
        map_commit_to_id: commit -> node id.
        commit_type: commit -> node type. Updated in place: the HEAD commit
            becomes ``FIXING_HEAD`` if it was ``FIX``, otherwise ``HEAD``.

    Returns:
        The node scatters in ``GIT_SORT_TIME`` walk order.
    """
    head_id = project_repo.head.target.id
    head_types = (NodeType.HEAD, NodeType.FIXING_HEAD)

    nodes = []
    for commit in project_repo.walk(head_id, pygit2.GIT_SORT_TIME):
        # draw commit nodes using preprocessed commit types
        node_id = map_commit_to_id[commit]
        if commit.id == head_id:
            commit_type[commit] = (NodeType.FIXING_HEAD
                                   if commit_type[commit] == NodeType.FIX
                                   else NodeType.HEAD)
        node_type = commit_type[commit]

        # set node data according to commit type
        node_size = 10 if node_type in head_types else 8
        displayed_message = commit.message.partition('\n')[0]
        # The label previously looked the type up by ``commit.hex`` although
        # every other access to ``commit_type`` is keyed by the commit.
        node_label = f'Type: {node_type}<br>' \
                     f'Hash: {commit.hex}<br>' \
                     f'Author: {commit.author.name}<br>' \
                     f'Date: {datetime.fromtimestamp(commit.commit_time)}<br>' \
                     f'Message: {displayed_message}'
        nodes.append(
            _create_node(commit_coordinates[node_id], node_type.color,
                         node_size, node_label))
    return nodes
def report():
    """Print a table of authors with their e-mail address and commit count.

    Authors are keyed by e-mail address; the name shown is the one from the
    first commit seen for that address in reversed topological order.
    """
    repo = Repository('%s/.git' % find_toplevel(os.getcwd()))
    authors = dict()
    name_width = 0
    email_width = 0

    history = repo.walk(repo.head.target,
                        GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE)
    for commit in history:
        known = authors.get(commit.author.email)
        if known is not None:
            known['commits'] += 1
            continue
        authors[commit.author.email] = {
            'author': commit.author,
            'commits': 1,
        }
        # Column widths only need updating when a new author shows up.
        name_width = max(name_width, len(commit.author.name))
        email_width = max(email_width, len(commit.author.email))

    print('Name'.ljust(name_width), '\t', 'Email'.ljust(email_width), '\t',
          'Commits')
    print('_' * name_width, '\t', '_' * email_width, '\t', '_' * 7)
    for entry in authors.values():
        signature = entry['author']
        print(signature.name.ljust(name_width), '\t',
              signature.email.ljust(email_width), '\t', entry['commits'])
def classify_by_tag(
    path: str,
    start_tag: str,
    end_tag: Optional[str] = None,
    model: Optional[MLModel] = None
) -> List[str]:
    """Classify the commit messages between two tags of a repository.

    Walks from ``end_tag`` (or ``refs/heads/master`` when it is None) and
    hides everything reachable from ``start_tag``.

    Raises:
        RepositoryNotFoundException: if ``<path>/.git`` does not exist.
    """
    repo_path = os.path.join(path, ".git")
    if not os.path.exists(repo_path):
        raise RepositoryNotFoundException
    repo = Repository(repo_path)

    lower = repo.revparse_single("refs/tags/" + start_tag)
    upper_name = ("refs/heads/master"
                  if end_tag is None else "refs/tags/" + end_tag)
    upper = repo.revparse_single(upper_name)

    walker = repo.walk(upper.id, GIT_SORT_TOPOLOGICAL)
    walker.hide(lower.id)
    orig_messages = [commit.message.lower() for commit in walker]
    return classify_messages(orig_messages, model)
def get_commit_activity(self, project):
    """Return weekly commit counts for ``project``'s repository.

    ``self.get_weeks()`` supplies a mapping from week start date (a Monday)
    to a dict with ``'mine'`` and ``'others'`` counters. Every commit
    reachable from HEAD whose week is in that mapping increments ``'mine'``;
    commits from other weeks are ignored.

    Returns:
        A list of ``{"week", "mine", "others"}`` dicts sorted by week, with
        the week rendered in ISO format.
    """
    from datetime import date, timedelta
    from pygit2 import Repository
    from pygit2 import GIT_SORT_TIME

    repo = Repository(project.gitrepo)
    weeks = self.get_weeks()

    for commit in repo.walk(repo.head.oid, GIT_SORT_TIME):
        commit_day = date.fromtimestamp(commit.commit_time)
        # Normalise to the Monday of the commit's week.
        commit_week = commit_day - timedelta(days=commit_day.weekday())
        if commit_week not in weeks:
            continue
        weeks[commit_week]['mine'] += 1

    # ``sorted(weeks)`` iterates the keys on Python 2 and 3 alike;
    # ``dict.iterkeys()`` exists on Python 2 only.
    return [
        {
            "week": week.isoformat(),
            "mine": weeks[week]['mine'],
            "others": weeks[week]['others'],
        }
        for week in sorted(weeks)
    ]
def get_git_info(git_working_tree_dir):
    """Summarise the repository that contains ``git_working_tree_dir``.

    Returns:
        A dict with the HEAD commit (hash, message, author name), the current
        branch shorthand, the number of files changed according to
        ``repo.diff()`` and the number of commits reachable from HEAD.
    """
    repository_path = discover_repository(git_working_tree_dir)
    assert repository_path is not None

    repo = Repository(repository_path)
    history = list(repo.walk(repo.head.target, GIT_SORT_NONE))
    newest = history[0]
    working_diff = repo.diff()

    head_info = {
        'hash': newest.hex,
        'message': newest.message,
        'author': newest.author.name
    }
    branch_info = {'name': repo.head.shorthand}
    stats_info = {'files_changed': working_diff.stats.files_changed}
    return {
        'head_commit': head_info,
        'branch': branch_info,
        'stats': stats_info,
        'num_commits': len(history)
    }
class RepositoryProcessor(object):
    """Feeds a repository's commits to per-author badge processors."""

    def __init__(self, repository_path):
        self.repo = GitRepository(repository_path + '/.git')
        # author e-mail -> list of processor instances for that author
        self.users = {}

    def get_bages_processors_for_user(self, email):
        """Return the processors for ``email``, creating them on first use."""
        if email in self.users:
            return self.users[email]
        self.users[email] = []
        for badge_class in initialize_badge_classes():
            logging.info(u'Initializing badge class [%s] for user [%s]' % (str(badge_class), email))
            self.users[email].append(badge_class(email))
        return self.users[email]

    def process(self):
        """Process all commits oldest-first and return the collaborators.

        Returns:
            A list with one dict per author e-mail. Each dict has ``email``
            and ``badges`` (slugs of awarded ``Badge`` processors) plus
            whatever non-``Badge`` processors and
            ``count_modifications_by_user`` contribute. Each dict is also
            printed.
        """
        newest_first = list(self.repo.walk(self.repo.head.oid, GIT_SORT_TIME))
        for commit in reversed(newest_first):
            committed_at = datetime.fromtimestamp(commit.commit_time)
            for badge in self.get_bages_processors_for_user(commit.author.email):
                badge.process_commit(commit, committed_at)

        result = []
        for user_email, badges in self.users.items():
            user = {"email": user_email, "badges": []}
            result.append(user)
            for badge in badges:
                if isinstance(badge, Badge):
                    if badge.award_this():
                        user['badges'].append({"badge_slug": badge.slug})
                else:
                    # Processors that are not badges contribute plain data.
                    user.update(badge.update_data())
            user.update(count_modifications_by_user(user_email, self.repo.path))
            # Call syntax prints the same on Python 2 and is valid on Python 3.
            print(user)
        return result
def get_labels(repo_path, branch, pair_file, last_commit):
    """
    Get the labels from a pair file produced by the SZZ algorithm.

    The commits of ``branch`` are taken oldest first up to and including
    ``last_commit`` and returned newest first as ``[hex, label]`` lists. The
    label is "1" when the commit's hex appears as the second element of any
    pair in ``pair_file`` (a JSON list of pairs) and "0" otherwise.
    """
    repo = Repository(repo_path)
    head = repo.references.get(branch)
    history = list(
        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))

    commits = []
    for commit in history:
        commits.append(commit)
        if commit.hex == last_commit:
            break
    commits.reverse()

    with open(pair_file, 'r') as inp:
        pairs = json.load(inp)
    flagged = {pair[1] for pair in pairs}

    return [[commit.hex, "1" if commit.hex in flagged else "0"]
            for commit in tqdm(commits)]
def save_label_distribution(repo_path, branch, labels, res_path):
    """
    Save a bar chart with the number of positively labelled commits per year.

    Args:
        repo_path: path of the git repository.
        branch: name of the reference whose history is inspected.
        labels: iterable of ``[commit_hex, label]``; label "1" is counted.
        res_path: file the figure is saved to.
    """
    flagged = {label[0] for label in labels if label[1] == "1"}

    repo = Repository(repo_path)
    head = repo.references.get(branch)
    commits = list(repo.walk(head.target, GIT_SORT_TOPOLOGICAL))

    # Topological order is not strictly chronological, so take the year range
    # from all commits rather than from the first and last one of the walk.
    commit_years = [dat.fromtimestamp(c.commit_time).year for c in commits]
    start_year = min(commit_years)
    end_year = max(commit_years)

    years = list(range(start_year, end_year + 1))
    year_dist = [0] * len(years)
    for commit, commit_year in zip(commits, commit_years):
        if commit.hex in flagged:
            # Slot 0 belongs to start_year. The previous extra "- 1" moved
            # every count one year back and wrapped the first year to the end.
            year_dist[commit_year - start_year] += 1

    fig = plt.figure()
    plt.bar(years, year_dist)
    plt.xticks(years)
    plt.xlim(xmin=years[0] - 1, xmax=years[-1] + 1)
    fig.autofmt_xdate()
    plt.savefig(res_path)
def parse_diffusion_features(pid, repo_path, branch, start, stop=-1):
    """
    Extract diffusion features from a slice of a branch's commits.

    The commits of ``branch`` are listed oldest first and sliced with
    ``start`` (shifted one back when positive, so the first commit of
    interest has a predecessor to diff against) and ``stop`` (-1 means "to
    the end"). Each commit after the first of the slice is diffed against its
    predecessor in the slice.

    The result is stored in ``RES[pid]`` as a list with one entry per sliced
    commit. Entry ``i`` describes the commit at slice position ``i + 1`` as
    ``[hex, modified subsystems, modified top-level modules, entropy]`` (all
    strings); the final entry therefore stays empty.
    """
    repo = Repository(repo_path)
    head = repo.references.get(branch)
    commits = list(
        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))

    start = start - 1 if (start > 0) else start
    commits = commits[start:stop] if (stop != -1) else commits[start:]

    features = [[] for _ in range(len(commits))]
    for i, commit in enumerate(tqdm(commits[1:], position=pid)):
        diff = repo.diff(commits[i], commit)

        modules = set()
        subsystems_mapping = {}
        file_changes = []
        total_change = 0

        for patch in diff:
            # Skip binary files
            if patch.delta.is_binary:
                continue
            _, addition, deletions = patch.line_stats
            total_change = total_change + (addition + deletions)
            file_changes.append(addition + deletions)

            # Record the directory chain of the file as a nested dict.
            fpath = patch.delta.new_file.path
            subsystems = fpath.split('/')[:-1]
            root = subsystems_mapping
            for system in subsystems:
                if system not in root:
                    root[system] = {}
                root = root[system]
            # Files in the repository root have no subsystem. The previous
            # ``subsystems > 0`` compared a list with an int, which is a
            # TypeError on Python 3 and always true on Python 2.
            if subsystems:
                modules.add(subsystems[0])

        # Check how many subsystems have been touched
        modified_systems = count_diffing_subsystems(subsystems_mapping)
        # Calculate the entropy for the commit
        entropy_change = count_entropy(file_changes, total_change)

        # Add all features
        features[i].append(str(commit.hex))
        features[i].append(str(float(modified_systems)))
        features[i].append(str(float(len(modules))))
        features[i].append(str(float(entropy_change)))
    RES[pid] = features
def getHist(repo):
    """Check out HEAD of the repository at ``repo`` and list its commits.

    Returns the commits reachable from HEAD in reversed topological order.
    """
    base = Repository(repo)
    base.checkout('HEAD')
    ordering = GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE
    return list(base.walk(base.head.target, ordering))
def get_and_update_repo_cache(repo_path):
    """Load the statistics cache of a repository, extend it and save it.

    The cache lives in ``<repo_path>-stats.cache``. Commits are walked in
    topological order from HEAD until the cached ``latest_sha`` or a commit
    whose parent cannot be resolved is reached. Commits whose message starts
    with "merge" are skipped. For the others the line delta is added to
    ``day_to_count['Lines']``; per-author monthly additions, deletions and
    commit counts are recorded unless the commit looks like an embedded
    library or is larger than 3000 lines and is not whitelisted.

    Returns:
        The updated cache dictionary.

    NOTE(review): ``latest_sha`` is only assigned while it is None, so a
    cache loaded from disk never advances its marker - confirm this is
    intended.
    """
    cache_filename = '%s-stats.cache' % repo_path
    if os.path.exists(cache_filename):
        with open(cache_filename) as f:
            data = load(f)
    else:
        data = {
            'author_to_month_to_additions': defaultdict(defaultdict_int),
            'author_to_month_to_deletions': defaultdict(defaultdict_int),
            'author_to_month_to_commits': defaultdict(defaultdict_int),
            'day_to_count': defaultdict(defaultdict_int),
            'latest_sha': None,
        }

    repo = Repository(repo_path)
    for commit in repo.walk(repo.head.target, GIT_SORT_TOPOLOGICAL):
        if commit.type != GIT_OBJ_COMMIT:
            continue
        if data['latest_sha'] == commit.hex:
            break
        if commit.message.lower().startswith('merge'):
            continue

        try:
            d = repo.diff('%s^' % commit.hex, commit)
        except KeyError:
            # First commit!
            break
        patches = list(d)
        additions = sum(p.additions for p in patches)
        deletions = sum(p.deletions for p in patches)
        author = author_aliases.get(commit.author.email, commit.author.email)
        day = date.fromtimestamp(commit.commit_time)
        data['day_to_count']['Lines'][day] += additions
        data['day_to_count']['Lines'][day] -= deletions

        whitelisted = commit.hex in whitelist_commits
        if additions > 1000 and deletions < 5 and not whitelisted:
            if commit.hex not in blacklist_commits:
                # Call syntax: same output on Python 2, valid on Python 3.
                print('WARNING: ignored %s looks like an embedding of a lib (message: %s)' % (commit.hex, commit.message))
            continue
        if (additions > 3000 or deletions > 3000) and not whitelisted:
            # Guess that if additions == deletions it's a big rename of files
            if commit.hex not in blacklist_commits and additions != deletions:
                print('WARNING: ignored %s because it is bigger than 3k lines. Put this commit in the whitelist or the blacklist (message: %s)' % (commit.hex, commit.message))
            continue

        month = date(day.year, day.month, 1)
        data['author_to_month_to_additions'][author][month] += additions
        data['author_to_month_to_deletions'][author][month] += deletions
        data['author_to_month_to_commits'][author][month] += 1
        if data['latest_sha'] is None:
            data['latest_sha'] = commit.hex

    with open(cache_filename, 'w') as f:
        dump(data, f)
    return data
def save_history_features_graph(repo_path, branch, graph_path):
    """
    Track the number of developers that have worked in a repository and save the
    results in a graph which could be used for later use.

    The graph is a dict ``file name -> {'lastcommit': hex, <commit hex>:
    {'prevcommit': hex or "", 'authors': committer names}}`` and is written
    to ``graph_path`` as JSON.
    """
    repo = Repository(repo_path)
    head = repo.references.get(branch)
    # Oldest commit first.
    commits = list(
        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
    current_commit = repo.head.target
    start_time = time.time()
    all_files = {}
    current_commit = repo.get(str(current_commit))
    # Seed the graph with every file in the tree of the repository's HEAD
    # commit, attributed to that commit's committer.
    files = get_files_in_tree(current_commit.tree, repo)
    for (_, name) in tqdm(files):
        all_files[name] = {}
        all_files[name]['lastcommit'] = current_commit.hex
        all_files[name][current_commit.hex] = {}
        all_files[name][current_commit.hex]["prevcommit"] = ""
        all_files[name][current_commit.hex]["authors"] = [
            current_commit.committer.name
        ]
    # ``commits[i]`` is the predecessor of ``commit`` in the list because the
    # enumeration runs over ``commits[1:]``.
    for i, commit in enumerate(tqdm(commits[1:])):
        files = get_diffing_files(commit, commits[i], repo)
        for (_, name, _) in files:
            if name not in all_files:
                all_files[name] = {}
            last_commit = ""
            if 'lastcommit' not in all_files[name]:
                all_files[name]['lastcommit'] = commit.hex
            else:
                last_commit = all_files[name]['lastcommit']
            all_files[name][commit.hex] = {}
            all_files[name][commit.hex]["prevcommit"] = last_commit
            # Authors accumulate: this committer plus everyone recorded for
            # the file's previously recorded commit.
            authors = set([commit.committer.name])
            if last_commit:
                authors.update(all_files[name][last_commit]["authors"])
            all_files[name][commit.hex]["authors"] = authors
            all_files[name]['lastcommit'] = commit.hex
    with open(graph_path, 'w') as output:
        # Author sets are not JSON serialisable; ``set_to_list`` is passed as
        # the fallback serialiser.
        json.dump(all_files, output, default=set_to_list)
    end_time = time.time()
    print("Done")
    print("Overall processing time {}".format(end_time - start_time))
def get_commits():
    """Fill the module-level commit lists from the repository at REPO_PATH.

    For every commit reachable from HEAD the id string is appended to both
    ``commit_list`` and ``work_list`` and ``0.0`` is appended to
    ``commit_complexities``.
    """
    global commit_list, work_list, commit_complexities

    repo = Repository(REPO_PATH)
    for commit in repo.walk(repo.head.target):
        commit_id = str(commit.id)
        commit_list.append(commit_id)
        work_list.append(commit_id)
        commit_complexities.append(0.0)
def gets(cls, path, max_count=100, order=GIT_SORT_TIME):
    """Get commits from a git repository.

    :param path: The normalized path to the git repository.
    :param max_count: maximum number of commits to return.
    :param order: sort order used for walking the commits.
    :return: a list of ``cls(hex, parent_hexes)``, starting at HEAD.
    """
    repo = Repository(path)
    limited = islice(repo.walk(repo.head.target, order), max_count)
    result = []
    for commit in limited:
        parent_ids = [parent.hex for parent in commit.parents]
        result.append(cls(commit.hex, parent_ids))
    return result
def test_003_init_in_branch(data_dir: pathlib.Path,
                            root_repo: pygit2.Repository) -> None:
    """``core.init()`` on a freshly created ``dev`` branch writes the
    configuration stored in ``003_wok.yml``."""
    head_commit = next(root_repo.walk(root_repo.head.target))
    dev_branch = root_repo.branches.local.create(name='dev',
                                                 commit=head_commit)
    root_repo.checkout(refname=dev_branch,
                       strategy=pygit2.GIT_CHECKOUT_FORCE)

    core.init()

    actual = config.Config.load(path=pathlib.Path('wok.yml'))
    expected = config.Config.load(path=data_dir / '003_wok.yml')
    assert actual == expected
def gets(cls, path, max_count=100, order=GIT_SORT_TIME):
    """Get commits from a git repository.

    :param path: The normalized path to the git repository.
    :param max_count: maximum number of commits to return.
    :param order: sort order used for walking the commits.
    :return: a list of ``cls(hex, parent_hexes)``, starting at HEAD.
    """
    repo = Repository(path)
    walker = repo.walk(repo.head.target, order)

    def build(commit):
        return cls(commit.hex, [parent.hex for parent in commit.parents])

    return [build(commit) for commit in islice(walker, max_count)]
def _map_commits_to_nodes(
        project_repo: pygit2.Repository) -> tp.Dict[pygit2.Commit, int]:
    """Map every commit reachable from HEAD to a node id.

    Node ids are consecutive integers starting at 0, assigned in
    ``GIT_SORT_TIME`` walk order.
    """
    walker = project_repo.walk(project_repo.head.target.hex,
                               pygit2.GIT_SORT_TIME)
    return {commit: node_id for node_id, commit in enumerate(walker)}
def get_time(path):
    """Print how many commits were made in each hour of the day.

    Commit times are converted to local time; one line is printed for every
    hour from 0 to 23.
    """
    commits_per_hour = {}
    repo = Repository(path)
    for commit in repo.walk(repo.head.target, GIT_SORT_TOPOLOGICAL):
        # ``.hour`` replaces ``strftime('%-H')``: the ``-`` flag is a
        # platform-specific extension that is not available everywhere.
        hour = datetime.datetime.fromtimestamp(commit.commit_time).hour
        commits_per_hour[hour] = commits_per_hour.get(hour, 0) + 1

    for hour in range(24):
        print("%d hour has %d commits" % (hour, commits_per_hour.get(hour, 0)))
def generate_walkers(
    repo: Repository,
    branch_names: Iterable[str],
    simplify_first_parent: bool,
    sorting: int,
) -> Iterable[Walker]:
    """Yield one commit walker per branch name.

    All walkers are created first, each starting at the peeled target of its
    branch with the given ``sorting``. If ``simplify_first_parent`` is true,
    every walker is restricted to first parents before any is yielded.
    """
    walkers = tuple([
        repo.walk(repo.branches[name].peel().id, sorting)
        for name in branch_names
    ])
    if simplify_first_parent:
        for walker in walkers:
            walker.simplify_first_parent()
    yield from walkers
def get_authors(self, repo_path):
    """Collect the distinct commit authors of a repository.

    Args:
        repo_path: path of the git repository.

    Returns:
        A set of ``Author(name, email)`` for all commits reachable from HEAD,
        or None if anything goes wrong (an error is reported through
        ``Helpers().print_error``).
    """
    try:
        if self.args.verbose:
            # The placeholder was missing, so the path was never shown.
            Helpers().print_success("Collecting authors in {}".format(repo_path))
        repo = Repository(repo_path)
        return {
            Author(commit.author.name, commit.author.email)
            for commit in repo.walk(repo.head.target, GIT_SORT_TOPOLOGICAL)
        }
    except Exception:
        # Best effort by design: report the failure and signal it with None.
        Helpers().print_error("{}: Could not collect authors".format(repo_path))
        return None
def apply_patches(repo_name, comments, patch_path):
    """
    Build a shell script that re-applies patches and commits them again with
    their original commit message.

    - The repository is assumed to be located in a sibling directory.

    Commits are walked in reversed topological order. Script lines are
    emitted for a commit when the following commit is visited, and only if
    its id is in ``comments``.
    NOTE(review): because of that one-step delay the last commit of the walk
    never gets script lines - confirm this is intended.

    Returns the script text.
    """
    repo = Repository(os.path.join(repo_name, '.git'))
    selected = dict(zip(comments, comments))
    patch_dir = os.path.basename(patch_path)

    previous_key = None
    previous_message = None
    with io.StringIO() as script:
        history = repo.walk(repo.head.target,
                            GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE)
        for commit in history:
            if previous_key and previous_key in selected:
                patch_file = os.path.join(patch_dir, previous_key + ".patch")
                # FIXME: how to create image files, eg. png
                script.write("patch -p1 -i ../" + patch_file + "\n")
                # Assumes the current directory is a git repository.
                script.write("git add . \n")
                # The message is passed to git through a here-document:
                #   git commit -F- <<EOF
                #   Message goes here
                #   EOF
                script.write("git commit -F- <<EOF\n")
                for message_line in previous_message.split('\n'):
                    script.write(message_line.strip() + "\n")
                script.write("EOF\n")
            previous_message = commit.message
            previous_key = str(commit.oid)
        return script.getvalue()
def main():
    """Write per-release commit statistics of the top developers to a CSV.

    Command line arguments: the directory with per-release contribution CSV
    files, the path of the git repository and the output file.

    For every contribution file the release name is taken from the file name,
    the number of files each top developer owns is counted, the release tag
    is checked out and, for each of those developers, a row with the total
    number of commits and the developer's own commits is collected.
    """
    from collections import Counter

    contrib_data_dir = sys.argv[1]
    git_repo = sys.argv[2]
    output_file = sys.argv[3]

    result = []
    exec_dir = os.getcwd()
    for contrib_file in os.listdir(contrib_data_dir):
        release = contrib_file[len(contrib_file_prefix
                                   ):][:-len(contrib_file_extension)]
        contrib_path = os.path.join(contrib_data_dir, contrib_file)

        # developer -> number of files where they are the top contributor
        top_devs = Counter()
        with open(contrib_path, newline="") as csv_file:
            for row in csv.DictReader(csv_file):
                top_dev = row['top_single_dev_contribution_knowledge'].split(
                    ":")[0]
                top_devs[top_dev] += 1

        os.chdir(git_repo)
        try:
            call(["git", "checkout", "tags/" + release])
        finally:
            os.chdir(exec_dir)

        if not top_devs:
            continue

        # Walk the history once per release instead of once per developer.
        repo = Repository(os.path.join(git_repo, git_folder))
        commits_by_author = Counter(
            commit.author.name for commit in repo.walk(repo.head.target))
        commit_count = sum(commits_by_author.values())

        for top_dev, files_owned in top_devs.items():
            result.append({
                'release': release,
                'release_commits': commit_count,
                'top_dev': top_dev,
                'files_owned': files_owned,
                'dev_commits': commits_by_author[top_dev]
            })

    with open(output_file, 'w', newline='') as output:
        writer = csv.DictWriter(output, csv_header)
        writer.writeheader()
        writer.writerows(result)
def get_churn_per_commit(git_repo, output_file):
    """Write the touched-file rows of every commit to a CSV file.

    Args:
        git_repo: path of the repository working directory.
        output_file: path of the CSV file to write; a relative path is
            resolved against the directory the function was called from.

    ``git_folder``, ``csv_header`` and ``get_touched_files`` are resolved as
    module-level names.
    """
    touched_files = []
    exec_dir = os.getcwd()
    repo = Repository(os.path.join(git_repo, git_folder))

    os.chdir(git_repo)
    try:
        for commit in repo.walk(repo.head.target):
            # Accumulate: plain assignment kept only the rows of the last
            # commit visited.
            touched_files.extend(get_touched_files(commit))
    finally:
        # ``exec_dir`` was captured but never used; restore it so the caller's
        # working directory is not changed as a side effect.
        os.chdir(exec_dir)

    with open(output_file, "w", newline='') as output:
        writer = csv.DictWriter(output, csv_header)
        writer.writeheader()
        writer.writerows(touched_files)
def obtain_cloned_repos(settings, logger):
    """
    Obtains information (e.g. number of commits) of the cloned repositories

    Reads the repository/issue JSON named by the ``results-repos-output-file``
    setting, scans ``<download-output-path-repo>/<owner>/<repo>`` directories
    and writes one CSV row per cloned repository to the file named by
    ``results-clone-info-output-file``. ``logger`` is not used.
    """
    input_filename = settings.get('results-repos-output-file')
    with open(input_filename, newline='', encoding='utf-8') as input_file:
        repos = json.load(input_file)
    # Obtain earliest todo-issue (discard all other data)
    # Result: repo name -> POSIX timestamp of the earliest issue creation.
    # NOTE(review): ``min(..., default=None)`` returns None for a repository
    # without issues, and the following ``.get`` would then fail - confirm
    # the input always contains at least one issue per repository.
    repos = map(lambda kv: (kv[0], datetime.datetime.fromisoformat(
        min(kv[1].get('issues'), key=lambda y: y.get('created_at'), default=None).get('created_at')
        # Read dates are in UTC
    ).replace(tzinfo=datetime.timezone.utc).timestamp()
    ), repos.items())
    repos = dict(repos)
    cloned_repo_lst = []
    path = settings.get("download-output-path-repo")
    with os.scandir(path) as it:
        for entry in it:
            if entry.is_dir():
                # Iterate over repo folders (of a single author)
                with os.scandir(os.path.join(path, entry.name)) as it2:
                    for repo in it2:
                        if repo.is_dir():
                            repo_name = entry.name + "/" + repo.name
                            print("Handling " + repo_name)
                            repo_path = os.path.join(path, entry.name, repo.name)
                            r = Repository(repo_path)
                            earliest_todo_issue = repos.get(repo_name)
                            total_commits = 0
                            pre_commits = 0
                            # Commits are only counted for repositories that
                            # appear in the issue data; others keep 0/0.
                            if earliest_todo_issue is not None:
                                repos[repo_name] = [0, 0]
                                for commit in r.walk(r.head.target, GIT_SORT_TIME | GIT_SORT_REVERSE):
                                    # Commits made before the earliest issue.
                                    if commit.commit_time < earliest_todo_issue:
                                        pre_commits += 1
                                    total_commits += 1
                            cloned_repo_lst.append(
                                {
                                    "repo": repo_name,
                                    "cloned": True,
                                    "total_commits": total_commits,
                                    "earliest_todo_issue": earliest_todo_issue,
                                    "pre_earliest_issue_commits": pre_commits,
                                })
    df_cloned_repos = pd.DataFrame(cloned_repo_lst, columns=["repo", "cloned", "total_commits", "earliest_todo_issue", "pre_earliest_issue_commits"])
    df_cloned_repos.to_csv(settings.get('results-clone-info-output-file'), index=False)
def authors(path):
    """Print the number of commits per author, identified by name and e-mail."""
    repo = Repository(path)
    info = {}
    for commit in repo.walk(repo.head.target, GIT_SORT_TOPOLOGICAL):
        key = "{0} <{1}>".format(commit.author.name, commit.author.email)
        info[key] = info.get(key, 0) + 1

    for identity, total in info.items():
        print("{0} has {1} commits".format(identity, total))
def shift(amount, repo_path):
    """Rewrite commit messages by passing ``ISSUE_RE`` matches to ``add``.

    Commits are walked in reversed topological order from HEAD. A commit is
    recreated under a fresh reference when its message changed or when one of
    its parents was already recreated; rewritten parents are substituted for
    the originals.

    Returns:
        ``(changelog, reference)``: a dict mapping original commit ids to
        the ids of their replacements, and the name of the reference that the
        new commits were created under.
    """
    repo = Repository(repo_path)
    head = repo.lookup_reference('HEAD').resolve()
    adder = partial(add, amount=amount)
    changelog = dict()
    reference = REF_FMT.format(time=time(), pid=getpid())

    ordering = GIT_SORT_REVERSE | GIT_SORT_TOPOLOGICAL
    for commit in repo.walk(head.oid, ordering):
        newmsg, nsubs = ISSUE_RE.subn(adder, commit.message)
        parent_rewritten = any(
            parent.oid in changelog for parent in commit.parents)
        if nsubs == 0 and not parent_rewritten:
            # Nothing changed for this commit or its ancestry: keep it.
            continue
        new_parents = [
            changelog.get(parent.oid, parent.oid)
            for parent in commit.parents
        ]
        changelog[commit.oid] = repo.create_commit(
            reference, commit.author, commit.committer, newmsg,
            commit.tree.oid, new_parents)
    return changelog, reference
def walk_repository(path):
    """Record for every blob when it first appeared and when it disappeared.

    Commits are walked oldest first (by time). A blob's ``start`` is the
    commit time at which it is first seen; its ``end`` is the time of the
    first commit whose blob set no longer contains it. Blobs that never
    disappeared get the current time as ``end``, the id of the last walked
    commit as ``end_commit`` and ``spicy = True``.

    Returns:
        dict mapping blob -> dict with ``start``, ``start_commit``,
        ``filename``, ``end``, ``end_commit``, ``difference`` (``end - start``
        divided by ``SECONDS_PER_DAY``) and optionally ``spicy``.

    NOTE(review): ``/`` floors two ints on Python 2 but not on Python 3 -
    check the type of ``SECONDS_PER_DAY`` if exact values matter.
    """
    # load our requested repository
    repo = Repository(path)

    blobs = {}
    old_blobs = set()
    for commit in repo.walk(repo.head.target,
                            GIT_SORT_TIME | GIT_SORT_REVERSE):
        date = commit.commit_time
        these_blobs = load_blobs_for_root_tree(commit.tree, repo)

        for blob, filename in these_blobs:
            if blob not in blobs:
                blobs[blob] = {}
                blobs[blob]["start"] = date
                blobs[blob]["start_commit"] = str(commit.id)
                blobs[blob]["filename"] = filename

        # Everything present in the previous commit but not in this one.
        for blob, filename in old_blobs - these_blobs:
            if blob not in blobs:
                # Call syntax prints the same on Python 2 and runs on 3.
                print("wat")
            else:
                blobs[blob]['end'] = date
                blobs[blob]['end_commit'] = str(commit.id)
                blobs[blob]['difference'] = (
                    date - blobs[blob]['start']) / SECONDS_PER_DAY
        old_blobs = these_blobs

    # this sets so that blobs on the current worktree to "it's still here"
    for blob in blobs:
        if "end" not in blobs[blob]:
            blobs[blob]['end'] = time.mktime(
                datetime.datetime.now().timetuple())
            blobs[blob]['end_commit'] = str(commit.id)
            blobs[blob]['difference'] = (
                blobs[blob]['end'] - blobs[blob]['start']) / SECONDS_PER_DAY
            blobs[blob]['spicy'] = True
    return blobs
def get_history_features(graph, repo_path, branch):
    """
    Extract the history features of every commit on a branch.

    For each commit (oldest first) the result holds
    ``[hex, number of authors, mean age, number of unique changes]`` as
    strings, computed over the files that differ from the preceding commit
    using the per-file ``graph``. The first commit gets the fixed values
    1.0, 0.0 and 0.0.
    """
    repo = Repository(repo_path)
    head = repo.references.get(branch)
    commits = list(
        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))

    features = [[str(commits[0].hex), str(1.0), str(0.0), str(0.0)]]

    for i, commit in enumerate(tqdm(commits[1:])):
        files = get_diffing_files(commit, commits[i], repo)

        seen_authors = set()
        ages = []
        previous_changes = set()
        for (_, name, _) in files:
            node = graph[name][commit.hex]
            seen_authors.update(node['authors'])
            prev_commit = node['prevcommit']
            if not prev_commit:
                continue
            previous_changes.add(prev_commit)
            # Age: seconds since the file's previously recorded commit.
            ages.append(commit.commit_time - repo.get(prev_commit).commit_time)

        mean_age = float(sum(ages)) / len(ages) if ages else 0
        features.append([
            str(commit.hex),
            str(float(len(seen_authors))),
            str(float(mean_age)),
            str(float(len(previous_changes))),
        ])
    return features
def log(dir):
    """Return one ``[message, committer name, time]`` row per commit.

    The walk starts at the commit HEAD points to and uses
    ``pygit2.GIT_SORT_TIME``.  The time field is the author timestamp passed
    through ``time_change``.

    :param dir: path handed to ``Repository`` (the name shadows the builtin
        but is kept for callers that pass it by keyword).
    :returns: list of three-element lists.
    """
    repo = Repository(dir)
    last = repo[repo.head.target]
    return [
        [commit.message, commit.committer.name,
         time_change(commit.author.time)]
        for commit in repo.walk(last.id, pygit2.GIT_SORT_TIME)
    ]
def main():
    """Write, per release, how many commits each top developer authored.

    Command line: ``<contrib_data_dir> <git_repo> <output_file>``.

    Every file in ``contrib_data_dir`` describes one release; the release
    name is the file name without ``contrib_file_prefix`` and
    ``contrib_file_extension``.  The tag ``tags/<release>`` is checked out,
    the history reachable from HEAD is walked once, and one CSV row is
    written per developer named in the file.
    """
    from collections import Counter

    contrib_data_dir = sys.argv[1]
    git_repo = sys.argv[2]
    output_file = sys.argv[3]

    result = []
    exec_dir = os.getcwd()

    for contrib_name in os.listdir(contrib_data_dir):
        release = contrib_name[len(contrib_file_prefix):][:-len(contrib_file_extension)]
        contrib_path = os.path.join(contrib_data_dir, contrib_name)

        # Number of files for which each developer is the top contributor.
        top_devs = Counter()
        with open(contrib_path, newline="") as csv_file:
            for row in csv.DictReader(csv_file):
                top_dev = row['top_single_dev_contribution_knowledge'].split(":")[0]
                top_devs[top_dev] += 1

        os.chdir(git_repo)
        try:
            call(["git", "checkout", "tags/" + release])
        finally:
            # Always return to the starting directory, even if checkout fails.
            os.chdir(exec_dir)

        if not top_devs:
            continue

        # Walk the history once per release instead of once per developer.
        repo = Repository(os.path.join(git_repo, git_folder))
        commits_by_author = Counter(
            commit.author.name for commit in repo.walk(repo.head.target))
        commit_count = sum(commits_by_author.values())

        for top_dev, files_owned in top_devs.items():
            result.append({'release': release,
                           'release_commits': commit_count,
                           'top_dev': top_dev,
                           'files_owned': files_owned,
                           'dev_commits': commits_by_author[top_dev]})

    with open(output_file, 'w', newline='') as output:
        writer = csv.DictWriter(output, csv_header)
        writer.writeheader()
        writer.writerows(result)
def export_patches(repo_name, comments, target_path, path_list=None):
    """
    Read the relevant ``comments`` from the repo and write them as patch
    files into ``target_path`` (default: .target).

    - ``comments`` are already sorted in commit order.

    The history is walked with ``GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE``.
    Whenever the previously visited commit id is in ``comments``, the diff
    between that commit's tree and the current commit's tree is written to
    ``<target_path>/<previous id>.patch``, restricted to changes for which
    ``has_related_file`` accepts the new or the old file path.

    :param repo_name: working directory of the repository (``.git`` is
        appended).
    :param comments: iterable of commit id strings to export.
    :param target_path: directory receiving the ``.patch`` files.
    :param path_list: paths forwarded to ``has_related_file``; defaults to
        an empty list.
    """
    if path_list is None:
        path_list = []

    repo = Repository(os.path.join(repo_name, '.git'))
    wanted = set(comments)

    prev_tree = None
    prev_k = None
    for commit in repo.walk(repo.head.target,
                            GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE):
        k = str(commit.oid)
        if prev_k and prev_k in wanted:
            # Interim approach:
            #   1. write the diff into a folder, named after the commit;
            #   2. apply the patch and commit again.
            changes = prev_tree.diff_to_tree(commit.tree)
            print("commit: ", prev_k, "=====================================")
            patch_fname = os.path.join(target_path, prev_k + ".patch")
            with open(patch_fname, 'w') as fh:
                for c in changes:
                    c_delta = c.delta
                    print(c_delta.new_file.path)
                    # Emit when the source file or the modified file is
                    # one of the related paths.
                    if has_related_file(c_delta.new_file.path, path_list) \
                            or has_related_file(c_delta.old_file.path, path_list):
                        fh.write(c.text)
        prev_tree = commit.tree
        prev_k = k
def get_commit_churn(repo_path):
    """Write one CSV row per (commit, touched file) with churn and debt flags.

    For every commit reachable from HEAD and every line returned by
    ``get_touched_files`` (tab-separated ``added, deleted, file name``) a
    row is produced with:

    * ``commit_debt`` -- 1 when the commit id appears in
      ``get_debt_commits(repo_path)``;
    * ``file_debt`` -- 1 when the file appears there and this commit's
      timestamp is not older than the first listed entry for that file.

    The rows are written to the module-level ``output_file`` using
    ``csv_header``.  The process working directory is switched to
    ``repo_path`` while the history is read and is always restored.

    :param repo_path: working directory of the repository.
    """
    exec_dir = os.getcwd()
    repo = Repository(os.path.join(repo_path, git_folder))
    file_churn = []

    os.chdir(repo_path)
    try:
        debt_commits = get_debt_commits(repo_path)
        # Precomputed lookups replace a scan of debt_commits per commit and
        # per file.  setdefault keeps the FIRST entry for each file path.
        debt_commit_ids = {entry['commit'] for entry in debt_commits}
        first_debt_ts = {}
        for entry in debt_commits:
            first_debt_ts.setdefault(entry['file_path'], entry['commit_ts'])

        for commit in repo.walk(repo.head.target):
            print(commit.id)
            commit_id = str(commit.id)
            curr_commit_ts = int(
                str(get_unix_timestamp(commit_id)).replace("\n", ""))
            is_debt_commit = 1 if commit_id in debt_commit_ids else 0

            for churn_line in get_touched_files(commit):
                added_lines, deleted_lines, file_name = churn_line.split("\t")

                is_file_debt = 0
                if file_name in first_debt_ts:
                    if curr_commit_ts >= int(first_debt_ts[file_name]):
                        is_file_debt = 1

                file_churn.append({
                    'commit': commit_id,
                    'commit_ts': curr_commit_ts,
                    'file_name': file_name,
                    'added_lines': added_lines,
                    'deleted_lines': deleted_lines,
                    'commit_debt': is_debt_commit,
                    'file_debt': is_file_debt,
                })
    finally:
        # Restore the caller's working directory even when a helper raises.
        os.chdir(exec_dir)

    with open(output_file, "w", newline='') as output:
        writer = csv.DictWriter(output, csv_header)
        writer.writeheader()
        writer.writerows(file_churn)
def cross_reference_commits_with_bug_reports():
    """Link bug-related commits of each repository to bug reports.

    Command line: ``<repos_root> <bug_reports_csv>``.

    The CSV must provide ``id``, ``creation_time`` and ``closed_time``
    columns; the two times are parsed with ``bug_date_format``.  For every
    immediate subdirectory of ``repos_root`` the commits accepted by
    ``is_bug_related`` are collected.  A commit is attached to a bug report
    when its commit time lies within the report's lifetime and
    ``are_related`` accepts the pair.  One XML document per repository is
    written below ``output_root_path``.

    ``os.makedirs(output_root_path)`` raises when the directory already
    exists; that behaviour is kept.
    """
    repos_root = sys.argv[1]
    bug_reports_file = sys.argv[2]

    with open(bug_reports_file, newline="") as csv_file:
        bug_reports = [
            {"id": bug["id"],
             "creation_time": datetime.strptime(bug["creation_time"],
                                                bug_date_format),
             "closed_time": datetime.strptime(bug["closed_time"],
                                              bug_date_format)}
            for bug in csv.DictReader(csv_file)]

    os.makedirs(output_root_path)

    for git_repo in get_immediate_subdirectories(repos_root):
        repo_name = os.path.basename(os.path.normpath(git_repo))
        repo = Repository(os.path.join(git_repo, git_folder))

        # Convert each commit time once, not once per bug report.
        bug_related_commits = [
            (commit, datetime.fromtimestamp(commit.commit_time))
            for commit in repo.walk(repo.head.target)
            if is_bug_related(commit)]

        root = etree.Element("commits")
        count = 0
        for bug_report in bug_reports:
            created = bug_report['creation_time']
            closed = bug_report['closed_time']
            for commit, committed_at in bug_related_commits:
                # This window filter may actually hurt the detection.
                if not created <= committed_at <= closed:
                    continue
                if are_related(commit, bug_report):
                    commit_xml = commit_to_xml(commit)
                    commit_xml.set("related_bug", bug_report["id"])
                    root.append(commit_xml)
                    count += 1
            print("repo: " + repo_name + ", bug: " + bug_report["id"] + " processed")

        root.set("count", str(count))
        output_xml = xml_to_string(root)
        with open(os.path.join(output_root_path,
                               repo_name + "_" + output_commit_file), "w") as file:
            file.write(output_xml)
def _pygit2_commits(commit, repository):
    """Return a pygit2 walker over the commits selected by ``commit``.

    :param commit: either a single revision (only that revision is walked,
        its first parent ``<rev>^`` being hidden) or a ``tail..head`` range.
        An empty ``head`` means ``HEAD``.
    :param repository: path handed to ``pygit2.Repository``.
    :returns: the walker, sorted with ``GIT_SORT_TOPOLOGICAL``.

    A ``tail`` that cannot be resolved (``KeyError``) is ignored, so the
    walk then runs through the whole ancestry of ``head``.
    """
    from pygit2 import Repository, GIT_SORT_TOPOLOGICAL
    g = Repository(repository)

    if '..' in commit:
        # partition always yields exactly two sides; split('..', 2) could
        # return three parts and break the unpacking.
        tail, _, head = commit.partition('..')
        head = head or 'HEAD'
    else:
        head = commit
        tail = commit + '^'

    walker = g.walk(g.revparse_single(head).oid, GIT_SORT_TOPOLOGICAL)

    try:
        walker.hide(g.revparse_single(tail).oid)
    except KeyError:
        pass

    return walker
def main():
    """Print a Debian-style changelog entry for every commit from HEAD.

    Command line: ``<package>``.

    The version of an entry comes from ``git describe --tags`` while that
    command succeeds.  After its first failure (exit status 128) every
    remaining commit uses ``0.0.0-0-g<short id>``.  A leading ``v`` in front
    of an ``x.y.z`` version is dropped.  Message, author name and e-mail are
    reduced to ASCII with non-ASCII characters replaced.
    """
    package = sys.argv[1]
    repo = Repository('.git')
    stripv = re.compile(r"v(\d+\.\d+\.\d+.*)")
    checktag = True

    # NOTE(review): the template's line breaks and indentation are laid out
    # per the Debian changelog format -- confirm against the intended output.
    log = (
        "{package} ({version}) unstable; urgency=low\n"
        "\n"
        "  * {message}\n"
        "\n"
        " -- {author_name} <{author_email}>  {time}\n"
    )

    def to_ascii(text):
        # Round-trip through ASCII so the result is text on Python 2 and 3.
        return text.encode("ascii", errors="replace").decode("ascii")

    for commit in repo.walk(repo.head.target, GIT_SORT_TOPOLOGICAL):
        version = '0.0.0-0-g%s' % str(commit.id)[0:7]
        if checktag:
            try:
                version = git("describe", "--tags", commit.id).strip()
            except sh.ErrorReturnCode_128:
                # No tag reachable from here; older commits have none either.
                checktag = False

        stripr = stripv.search(version)
        if stripr is not None:
            version = stripr.group(1)

        message = to_ascii(commit.message).strip()
        messages = ["    %s" % line for line in message.split("\n")]
        # The first line follows the "* " bullet of the template directly.
        messages[0] = messages[0].strip()
        message = "\n".join(messages)

        print(log.format(
            package=package,
            version=version,
            message=message,
            author_name=to_ascii(commit.author.name),
            author_email=to_ascii(commit.author.email),
            time=datetime.datetime.fromtimestamp(
                commit.commit_time).strftime("%a, %d %b %Y %H:%M:%S -0000"),
        ))
def processGitDiff(commitsNum):
    """Print per-file line churn between consecutive commits.

    Visits at most ``commitsNum`` commits from HEAD (``GIT_SORT_TIME``) in
    the hard-coded postgres checkout.  Each visited commit is diffed against
    the commit visited just before it.  For every patch the old path, new
    path and added/deleted line counts are printed, followed by the number
    of files changed for that pair.
    """
    remaining = commitsNum
    repo_root = "../git-repos/postgres"
    repo = Repository(repo_root + "/" + ".git")

    child_hex = ""
    for commit in repo.walk(repo.head.target, GIT_SORT_TIME):
        remaining -= 1
        if remaining < 0:
            break

        current_hex = commit.oid.hex
        if child_hex != "":
            diff = repo.diff(current_hex, child_hex)

            files_changed = 0
            for patch in diff:
                print(patch.old_file_path)
                print(patch.new_file_path)

                # The first element of every hunk line is its +/- marker.
                markers = [line[0]
                           for hunk in patch.hunks
                           for line in hunk.lines]
                print("lines added" + str(markers.count("+")))
                print("lines deleted" + str(markers.count("-")))

                files_changed += 1
            print("file changed" + str(files_changed))

        child_hex = current_hex
class GitStorage(BaseStorage):
    """Read-only storage backend on top of a git repository (pygit2).

    The repository is located with ``discover_repository`` from the path
    reported by ``IStorageInfo(context)``.  One commit is "checked out" at a
    time (``checkout``); path lookups resolve against that commit's tree.
    """

    _backend = None

    def __init__(self, context, repo_path=None):
        """Open the repository for ``context`` and check out HEAD.

        :raises PathNotFoundError: when no repository is found.
        """
        self.context = context
        rp = IStorageInfo(context).path

        try:
            self.repo = Repository(discover_repository(rp))
        except KeyError:
            # discover_repository may have failed.
            raise PathNotFoundError('repository does not exist at path')

        self.checkout()  # defaults to HEAD.

    @property
    def empty_root(self):
        return {'': '_empty_root'}

    def _get_empty_root(self):
        return self.empty_root

    def _get_obj(self, path, cls=None):
        """Resolve ``path`` in the checked-out commit.

        :param path: ``/``-separated path; empty fragments are skipped.
        :param cls: expected pygit2 type (``Tree``/``Blob``) or ``None``.
        :returns: the pygit2 object, or a dict for the special cases (empty
            root, sub-repository).
        :raises PathNotFoundError, PathNotDirError, PathNotFileError:
        """
        if path == '' and self._commit is None:
            # special case
            return self._get_empty_root()

        if self._commit is None:
            raise PathNotFoundError('repository is empty')

        root = self._commit.tree
        try:
            breadcrumbs = []
            fragments = list(reversed(path.split('/')))
            node = root
            oid = None
            while fragments:
                fragment = fragments.pop()
                if not fragment == '':
                    # no empty string entries, also skips over '//' and
                    # leaves the final node (if directory) as the tree.
                    oid = node[fragment].oid
                    node = self.repo.get(oid)
                    breadcrumbs.append(fragment)
                if node is None:
                    # strange.  Looks like it's either submodules only
                    # have entry nodes or pygit2 doesn't fully support
                    # this.  Try to manually resolve the .gitmodules
                    # file.
                    if cls is None:
                        # Only return this if a specific type was not
                        # expected.
                        submods = parse_gitmodules(self.repo.get(
                            root[GIT_MODULE_FILE].oid).data)
                        submod = submods.get('/'.join(breadcrumbs))
                        if submod:
                            fragments.reverse()
                            return {
                                '': '_subrepo',
                                'location': submod,
                                'path': '/'.join(fragments),
                                'rev': oid.hex,
                            }

            if node and (cls is None or isinstance(node, cls)):
                return node
        except KeyError:
            # can't find what is needed in repo, raised by pygit2
            raise PathNotFoundError('path not found')

        # not what we were looking for.
        if cls == Tree:
            raise PathNotDirError('path not dir')
        elif cls == Blob:
            raise PathNotFileError('path not file')
        raise PathNotFoundError('path not found')

    @property
    def _commit(self):
        return self.__commit

    @property
    def rev(self):
        if self.__commit:
            return self.__commit.hex
        return None

    @property
    def shortrev(self):
        # TODO this is an interim solution.
        if self.rev:
            return self.rev[:12]

    def basename(self, name):
        return name.split('/')[-1]

    def checkout(self, rev=None):
        """Select the commit that path lookups resolve against.

        ``None`` maps to ``HEAD``.  An unresolvable ``HEAD`` is treated as a
        new, empty repository; any other unresolvable revision raises
        ``RevisionNotFoundError``.
        """
        # None maps to the default revision.
        if rev is None:
            rev = 'HEAD'

        try:
            self.__commit = self.repo.revparse_single(rev)
        except KeyError:
            if rev == 'HEAD':
                # probably a new repo.
                self.__commit = None
                return
            # otherwise a RevisionNotFoundError should be raised.
            raise RevisionNotFoundError('revision %s not found' % rev)

    def files(self):
        """Return the paths of all blobs in the checked-out commit."""

        def _files(tree, current_path=None):
            results = []
            for node in tree:
                if current_path:
                    name = '/'.join([current_path, node.name])
                else:
                    name = node.name

                obj = self.repo.get(node.oid)
                if isinstance(obj, Blob):
                    results.append(name)
                elif isinstance(obj, Tree):
                    results.extend(_files(obj, name))
            return results

        if not self._commit:
            return []
        results = _files(self._commit.tree)
        return results

    def file(self, path):
        return self._get_obj(path, Blob).data

    def listdir(self, path):
        if path:
            tree = self._get_obj(path, Tree)
        else:
            if self._commit is None:
                return []
            tree = self._commit.tree

        return [entry.name for entry in tree]

    def format(self, **kw):
        # XXX backwards compatibility??
        return kw

    def log(self, start, count, branch=None, shortlog=False):
        """
        start and branch are literally the same thing.

        Return at most ``count`` log entries, walking from ``start``
        (``HEAD`` when ``None``) with ``GIT_SORT_TIME``.  Each entry
        describes the committer of that commit.

        :returns: list of dicts; ``[]`` when ``start`` is ``None`` and HEAD
            cannot be resolved.
        :raises RevisionNotFoundError: when an explicit ``start`` is unknown.
        """

        def _log(iterator):
            for pos, commit in iterator:
                if pos == count:
                    # A plain return ends the generator; raising
                    # StopIteration here is a RuntimeError under PEP 479.
                    return
                yield {
                    'author': commit.committer.name,
                    # e-mail of this entry's committer, not of the
                    # checked-out commit.
                    'email': commit.committer.email,
                    'date': self.strftime(committer_dt(commit.committer)),
                    'node': commit.hex,
                    'rev': commit.hex,
                    'desc': commit.message
                }

        if start is None:
            # assumption.
            start = 'HEAD'
            try:
                self.repo.revparse_single(start)
            except KeyError:
                return []

        try:
            rev = self.repo.revparse_single(start).hex
        except KeyError:
            raise RevisionNotFoundError('revision %s not found' % start)

        iterator = enumerate(self.repo.walk(rev, GIT_SORT_TIME))

        return list(_log(iterator))

    def pathinfo(self, path):
        """Describe ``path`` as a file, sub-repository or folder."""
        obj = self._get_obj(path)
        if isinstance(obj, Blob):
            return self.format(**{
                'type': 'file',
                'basename': self.basename(path),
                'size': obj.size,
                'date': self.strftime(committer_dt(self._commit.committer)),
            })
        elif isinstance(obj, dict):
            # special cases are represented as dict.
            if obj[''] == '_subrepo':
                return self.format(**{
                    'type': 'subrepo',
                    'date': '',
                    'size': 0,
                    'basename': self.basename(path),
                    # extra field.
                    'obj': obj,
                })
            elif obj[''] == '_empty_root':
                return self.format(**{
                    'type': 'folder',
                    'date': '',
                    'size': 0,
                    'basename': self.basename(path),
                })

        # Assume this is a Tree.
        return self.format(**{
            'basename': self.basename(path),
            'size': 0,
            'type': 'folder',
            'date': '',
        })

    def branches(self):
        return tuple(
            (b, self.repo.lookup_branch(b).target.hex)
            for b in self.repo.listall_branches()
        )

    def tags(self):
        return tuple(
            (b[10:], self.repo.lookup_reference(b).target.hex)
            for b in self.repo.listall_references()
            if b.startswith('refs/tags')
        )
#encoding utf8
from pygit2 import Repository
from pygit2 import GIT_SORT_TOPOLOGICAL, GIT_SORT_REVERSE

# Print the message of every commit written under one of these author
# names, walking the history of ./.git with the reversed topological sort.
AUTHOR_NAMES = ("ozawaseijiro", "ozawa.seijiro", "tf-s.ozawa")

repo = Repository('.git')
for commit in repo.walk(repo.head.target,
                        GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE):
    if commit.author.name in AUTHOR_NAMES:
        # Single-argument call form: same output on Python 2 and 3.
        print(commit.message)
class GitLogConverter(object):
    """Convert the commits reachable from HEAD into plain dicts / JSON lines."""

    def __init__(self, path):
        """Open the repository whose working directory is ``path``."""
        self.repo = Repository('%s/.git' % path)

    def get_commits(self):
        """Return a walker from HEAD sorted with ``GIT_SORT_TOPOLOGICAL``."""
        return self.repo.walk(
            self.repo.head.target,
            GIT_SORT_TOPOLOGICAL
        )

    def commits_as_dicts(self):
        """Lazily yield ``commit_to_dict`` for every commit of the walk."""
        return (self.commit_to_dict(commit) for commit in self.get_commits())

    def commit_to_dict(self, commit):
        """Return a JSON-serialisable description of ``commit``.

        The ``patches`` list describes the diff of the commit's tree against
        each parent's tree; the diffs of several parents are merged into the
        first one.  A commit without parents has an empty ``patches`` list.
        """
        patches = []
        commit_dict = {
            "id": str(commit.id),
            "type": commit.type,
            "author_name": commit.author.name,
            "author_email": commit.author.email,
            "author_time": commit.author.time,
            "author_time_offset": commit.author.offset,
            "committer_name": commit.committer.name,
            "committer_email": commit.committer.email,
            "committer_time": commit.committer.time,
            "committer_time_offset": commit.committer.offset,
            "message": commit.message,
            "message_encoding": commit.message_encoding,
            "patches": patches,
            "parent_ids": [str(parent_id) for parent_id in commit.parent_ids],
            "commit_time": commit.commit_time,
            "commit_time_offset": commit.commit_time_offset,
        }

        merged_diff = None
        for parent in commit.parents:
            diff = commit.tree.diff_to_tree(parent.tree)
            if merged_diff is None:
                merged_diff = diff
            else:
                merged_diff.merge(diff)

        if merged_diff is not None:
            for patch in merged_diff:
                patches.append({
                    "old_file_path": patch.old_file_path,
                    "new_file_path": patch.new_file_path,
                    "is_binary": patch.is_binary,
                    "old_id": str(patch.old_id),
                    "new_id": str(patch.new_id),
                    "status": patch.status,
                    "similarity": patch.similarity,
                    "additions": patch.additions,
                    "deletions": patch.deletions,
                })

        return commit_dict

    def print_commits_as_json(self, file=None):
        """Write one JSON document per commit and line to ``file``.

        :param file: writable text stream; ``None`` means the ``sys.stdout``
            in effect at call time, so later redirection is honoured.

        A broken pipe or Ctrl-C ends the output quietly.
        """
        if file is None:
            file = sys.stdout
        try:
            for commit_dict in self.commits_as_dicts():
                print(json.dumps(commit_dict), file=file, flush=True)
        except (BrokenPipeError, KeyboardInterrupt):
            pass
import json
import sys
from pygit2 import Repository, Oid, GIT_SORT_TOPOLOGICAL

# An all-zero id marks a ref that did not exist before / after the update.
nil="0000000000000000000000000000000000000000"

# stdin carries three whitespace-separated fields: before, after, ref.
payload = dict(zip(('before', 'after', 'ref'), sys.stdin.read().split()))
payload['created'] = payload['before'] == nil
payload['deleted'] = payload['after'] == nil

if not (payload['created'] or payload['deleted']):
    repo = Repository('.')
    # Commits reachable from 'after' but not from 'before'.
    log = repo.walk(Oid(hex=payload['after']), GIT_SORT_TOPOLOGICAL)
    log.hide(Oid(hex=payload['before']))
    payload['commits'] = []
    for commit in log:
        info = {'id': commit.hex, 'message': commit.message}
        author = {
            'name': commit.author.name,
            'email': commit.author.email,
            'timestamp': commit.author.time,
        }
        info['author'] = author
        committer = {'name': commit.committer.name}
# Pack the source tree into <tarname>.tar.gz.
shutil.make_archive(tarname, "gztar", root_dir=dirname)

# run dh_make
os.system("cd %s; dh_make -s -c gpl2 --createorig -y -a -e %s" % (dirname, email))

# remove extra files and copy source files
os.system("cd %s; rm -rf *.ex *.EX README*" % debname)
shutil.copy2("deb_control", debname+"control")
shutil.copy2("deb_copyright", debname+"copyright")

# make the dirs and install files
os.system("echo '%s' > %sdirs" % (installdir, debname))
os.system("echo 'gnome-keyring.so %s' > %sinstall" % (installdir, debname))

# write the changelog: one entry per commit reachable from HEAD
repo = Repository(".")
with open(debname + "changelog", "w") as changelog:
    for commit in repo.walk(repo.head.target, GIT_SORT_TIME):
        changelog.write("%s (%s) %s; urgency=low\n\n" % (basename, package_version_str, ubuntuname))
        for commit_line in commit.message.split("\n"):
            if len(commit_line) > 0:
                # Debian changelog detail lines are indented by two spaces.
                changelog.write("  " + commit_line + "\n")
        changelog.write("\n")

        # commit_time_offset is in minutes; render it as +HHMM / -HHMM.
        offset_minutes = commit.commit_time_offset
        sign = "-" if offset_minutes < 0 else "+"
        hours, minutes = divmod(abs(offset_minutes), 60)
        offset = "%s%02d%02d" % (sign, hours, minutes)

        # Shift the UTC commit time by the offset so that the printed clock
        # time and the printed offset describe the same instant.
        date = time.strftime(
            "%a, %d %b %Y %X",
            time.gmtime(commit.commit_time + offset_minutes * 60))

        # The trailer needs two spaces between the e-mail and the date.
        changelog.write(" -- %s <%s>  %s %s\n" % (author_name, email, date, offset))

# call debuild
os.system("cd %s; debuild -S -sa" % dirname)
import sh
import subprocess
import time

repo_url = 'https://github.com/octocat/Spoon-Knife.git'
repo_path = 'spoon-knife'

# Clone on first use only.
if not os.path.exists(repo_path):
    repo = clone_repository(repo_url, repo_path)

base = Repository(repo_path + '/.git')
base.checkout('HEAD')

history = []

# Display Commits Newest to Oldest: print each commit time as local time
# and remember the commit id.
for commit in base.walk(base.head.target, GIT_SORT_TOPOLOGICAL):
    # Single-argument call form: same output on Python 2 and 3.
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(commit.commit_time)))
    history.append(commit.hex)

# Display Commits Oldest to Newest (the walk currently prints nothing).
for commit in base.walk(base.head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE):
    pass
repo_id = result[0][0] else: # The repo isn't in the database yet, so we add it sql = 'INSERT INTO repo (repo_id, name) VALUES (NULL, :repo_name)' c.execute(sql, {"repo_name": repo_name}) conn.commit() # Retrieve the repo_id value generated by the database for the above insert repo_id = c.lastrowid # Loop around for each branch, adding their commits to the database for branch_name in repo.listall_branches(GIT_BRANCH_REMOTE): # Starting with the oldest commit in the branch, add its commits to the database branch = repo.lookup_reference('refs/remotes/' + branch_name) for commit in repo.walk(branch.target, GIT_SORT_TIME | GIT_SORT_REVERSE): # If requested, display the commit info for debugging purposes if debug == 1: print "commit {0}".format(commit.hex) print "Author: {0} <{1}>".format(unicode(commit.author.name).encode("utf-8"), commit.author.email) print datetime.utcfromtimestamp(commit.commit_time).strftime('Date: %a %b %d %H:%M:%S %Y +0000\n') print " {0}".format(unicode(commit.message).encode("utf-8")) # Check if the commit already exists in the database. Don't add it if its already there sql = 'SELECT commit_id, hash FROM commits WHERE repo = :repo AND hash = :hash' c.execute(sql, {"repo": repo_id, "hash": commit.hex}) result = c.fetchall() if len(result) > 0: # If requested, show debugging info
class GitStorage(Storage):
    """
    Git file storage backend.

    Files live in the working directory of a pygit2 repository; ``commit``
    records pending changes, and the read methods resolve names against
    HEAD's tree or the index.
    """

    def __init__(self, path):
        """
        Initialize repository.

        :param path: Absolute path to the existing Git repository.
        :type path: str
        """
        super(GitStorage, self).__init__()

        self.repo = Repository(path)
        self.index = self.repo.index
        self.index.read()

    @classmethod
    def create_storage(cls, path):
        """
        Create repository, and return GitStorage object on it

        :param path: Absolute path to the Git repository to create.
        :type path: str
        :returns: GitStorage
        """
        init_repository(path, False)
        return cls(path)

    def commit(self, user, message):
        """
        Save previous changes in a new commit.

        :param user: The commit author/committer.
        :type user: django.contrib.auth.models.User
        :param message: The commit message.
        :type message: unicode
        :returns: pygit2.Commit
        """
        # Refresh index before committing
        index = self.repo.index
        index.read()

        # Check the status of the repository
        status = self.repo.status()

        for filename, flags in status.items():
            # the file was deleted
            if flags in (GIT_STATUS_INDEX_DELETED, GIT_STATUS_WT_DELETED):
                # remove it from the tree
                del index[filename]

            # or the file was modified/added
            elif flags in (GIT_STATUS_INDEX_MODIFIED, GIT_STATUS_INDEX_NEW,
                           GIT_STATUS_WT_MODIFIED, GIT_STATUS_WT_NEW):
                # add it to the tree
                index.add(filename)

        treeid = index.write_tree()

        # Now make the commit
        author = Signature(u'{0} {1}'.format(
            user.first_name,
            user.last_name).encode('utf-8'),
            user.email.encode('utf-8')
        )
        committer = author

        try:
            parents = [self.repo.head.oid]
        except GitError:
            # No HEAD yet: this becomes the first commit.
            parents = []

        commit = self.repo.create_commit(
            'refs/heads/master',
            author, committer, message,
            treeid,
            parents
        )

        # Write changes to disk
        index.write()
        # and refresh index.
        self.index.read()

        # Return commit object
        return self.repo[commit]

    def log(self, name=None, limit=10):
        """
        Get history of the repository, or of a file if name is not None.

        :param name: File name within the repository.
        :type name: unicode or None
        :param limit: Maximal number of commits to get (default: 10), use a
            negative number to get all.
        :type limit: int
        :returns: list of pygit2.Commit
        """
        commits = []

        if not name:
            # Look for `limit` commits
            for commit in self.repo.walk(self.repo.head.oid, GIT_SORT_TIME):
                commits.append(commit)

                limit = limit - 1

                if limit == 0:
                    break

        else:
            # For each commits
            for commit in self.repo.walk(self.repo.head.oid, GIT_SORT_TIME):
                # Check the presence of the file in the tree

                if commit.parents:
                    # If the commit has parents, check if the file is present
                    # in the diff

                    diff = commit.tree.diff(commit.parents[0].tree)

                    for patch in diff:
                        # If the filename is the patch's filename...
                        if name.encode('utf-8') == patch.new_file_path:
                            # ... then we can add the commit to the list
                            # and leave the loop

                            commits.append(commit)

                            limit = limit - 1
                            break

                else:
                    # But if the commit has no parents (root commit)
                    # Simply check in its tree

                    try:
                        commit.tree[name]

                        # no error raised, it means the entry exists, so add
                        # the commit to the list
                        commits.append(commit)

                        limit = limit - 1

                    # If the file is not in the tree, then it raises a
                    # KeyError, so, just ignore it.
                    except KeyError:
                        pass

                # If the limit is reached, leave the loop
                if limit == 0:
                    break

        return commits

    def diffs(self, name=None, limit=10):
        """
        Get diffs between commits.

        Return the following dict :

            {"diffs": [
                {
                    "msg": unicode(<commit message>),
                    "date": datetime.fromtimestamp(<commit date>),
                    "author": unicode(<author name>),
                    "sha": unicode(<commit SHA>),
                    "parent_sha": unicode(<parent commit SHA>), # optional
                },
                # ...
            ]}

        :param name: File name within the repository.
        :type name: unicode or None
        :param limit: Maximal number of diffs to get (default: 10), use a
            negative number to get all.
        :type limit: int
        :returns: dict
        """
        commits = self.log(name=name, limit=limit)

        diffs = {'diffs': []}

        # For each commit
        for commit in commits:
            # Create a JSON object containing informations about the commit
            diff = {
                'msg': commit.message,
                'date': datetime.datetime.fromtimestamp(commit.commit_time),
                'author': commit.author.name,
                'sha': commit.hex,
            }

            if commit.parents:
                diff['parent_sha'] = commit.parents[0].hex

            # The SHA and parent SHA will be used to get the diff via AJAX.

            diffs['diffs'].append(diff)

        return diffs

    def diff(self, asha, bsha, name=None):
        """
        Get diff between two commits.

        :param asha: SHA of commit A.
        :type asha: unicode
        :param bsha: SHA of commit B.
        :type bsha: unicode
        :param name: File name within the repository.
        :type name: unicode or None
        :returns: unicode
        """
        c1 = self.repo[asha]
        c2 = self.repo[bsha]

        d = c1.tree.diff(c2.tree)

        if name:
            diff = u''

            # For each patch in the diff
            for patch in d:
                # Check if the patch is our file
                if name.encode('utf-8') == patch.new_file_path:
                    # Format the patch
                    for hunk in patch.hunks:
                        p = u'\n'.join(hunk.lines)

                        # And add the diff to the final diff
                        diff = u'{0}{1}'.format(diff, p)

            return diff

        # For a global diff, just return the full patch
        else:
            return d.patch

    def search(self, pattern, exclude=None):
        """
        Search pattern in GIT repository.

        :param pattern: Pattern to search.
        :type pattern: unicode
        :param exclude: Exclude some files from the search results
        :type exclude: regex
        :returns: list of tuple containing the filename and the list of
            matched lines.
        """
        entries = []

        self.index.read()

        # For each files in the index
        for ientry in self.index:
            # If the filename match the exclude_file regex, then ignore it
            if exclude and re.match(exclude, ientry.path.decode('utf-8')):
                continue

            # Get the associated blob
            blob = self.repo[ientry.oid]

            # Create entry
            entry = (ientry.path.decode('utf-8'), [])

            # Add matched lines to the entry
            for line in blob.data.decode('utf-8').splitlines():
                if pattern in line:
                    entry[1].append(line)

            # If the entry has no matched lines, then ignore
            if entry[1]:
                entries.append(entry)

        return entries

    def is_dir(self, name):
        """
        Check if name refers to a directory.

        :param name: File name within the repository.
        :type name: unicode
        :returns: True, False
        """
        # Check if the path exists, if not returns default value.
        if not self.exists(name):
            return False

        # Get the TreeEntry associated to name
        tentry = self.repo.head.tree[name]

        # Convert it to its pygit2 representation
        obj = tentry.to_object()

        # If it's a Tree, then we can return True
        if isinstance(obj, Tree):
            return True

        # The instance is a Blob, so it's a file, return False
        else:
            return False

    def mimetype(self, name):
        """
        Get the mimetype of a file.

        :param name: File name within the repository.
        :type name: unicode
        :returns: str
        """
        # If the file is a directory
        if self.is_dir(name):
            return 'inode/directory'

        # Or doesn't exist
        elif not self.exists(name):
            return 'unknown'

        # The file exists, check its mimetype
        else:
            import urllib
            import mimetypes

            url = urllib.pathname2url(name.encode('utf-8'))

            return mimetypes.guess_type(url)[0] or 'unknown'

    def walk(self):
        """
        Walk through the repository.
        """
        self.index.read()

        for entry in self.index:
            yield entry

    # Storage API

    def accessed_time(self, name):
        """
        Get last accessed time of a file.

        :param name: File name within the repository.
        :type name: unicode
        :returns: datetime
        :raises: IOError
        """
        if not self.exists(name):
            raise IOError(u"{0}: Not found in repository".format(name))

        abspath = os.path.join(self.repo.workdir, name)
        stats = os.stat(abspath)

        return datetime.datetime.fromtimestamp(stats.st_atime)

    def created_time(self, name):
        """
        Get creation time of a file.

        :param name: File name within the repository.
        :type name: unicode
        :returns: datetime
        :raises: IOError
        """
        if not self.exists(name):
            raise IOError(u"{0}: Not found in repository".format(name))

        abspath = os.path.join(self.repo.workdir, name)
        stats = os.stat(abspath)

        return datetime.datetime.fromtimestamp(stats.st_ctime)

    def modified_time(self, name):
        """
        Get last modified time of a file.

        :param name: File name within the repository.
        :type name: unicode
        :returns: datetime
        :raises: IOError
        """
        if not self.exists(name):
            raise IOError(u"{0}: Not found in repository".format(name))

        abspath = os.path.join(self.repo.workdir, name)
        stats = os.stat(abspath)

        return datetime.datetime.fromtimestamp(stats.st_mtime)

    def size(self, name):
        """
        Get file's size.

        :param name: File name within the repository.
        :type name: unicode
        :returns: int
        :raises: IOError
        """
        if not self.exists(name):
            raise IOError(u"{0}: Not found in repository".format(name))

        e = self.index[name]
        blob = self.repo[e.oid]

        return blob.size

    def exists(self, path):
        """
        Check if ``path`` exists in the Git repository.

        :param path: Path within the repository of the file to check.
        :type param: unicode
        :returns: True if the file exists, False if the name is available
            for a new file.
        """
        # If the head is orphaned (does not point to any commit), returns
        # False because there is nothing in the repository.
        if self.repo.head_is_orphaned:
            return False

        # Try getting the path via the tree
        try:
            self.repo.head.tree[path]
            return True

        # If it raises a KeyError, then the path doesn't exist
        except KeyError:
            return False

    def listdir(self, path=None):
        """
        Lists the contents of the specified path.

        :param path: Path of the directory to list (or None to list the
            root).
        :type path: unicode or None
        :returns: a 2-tuple of lists; the first item being directories, the
            second item being files.
        """
        abspath = os.path.join(self.repo.workdir, path) if path else self.repo.workdir

        dirs = []
        files = []

        for e in os.listdir(abspath):
            entry_fullpath = os.path.join(abspath, e)

            if os.path.isdir(entry_fullpath):
                # The repository's own metadata directory is never listed.
                if e != '.git':
                    dirs.append(e.decode('utf-8'))

            else:
                files.append(e.decode('utf-8'))

        return (dirs, files)

    def open(self, name, mode='rb'):
        """
        Opens the file given by name.

        :param name: Name of the file to open.
        :type name: unicode
        :param mode: Flags for openning the file (see builtin ``open``
            function).
        :type mode: str
        :returns: GitFile
        """
        abspath = os.path.join(self.repo.workdir, name)
        dirname = os.path.dirname(abspath)

        if 'w' in mode and not os.path.exists(dirname):
            os.makedirs(dirname)

        return GitFile(open(abspath, mode))

    def path(self, name):
        """
        Return the absolute path of the file ``name`` within the repository.

        :param name: Name of the file within the repository.
        :type name: unicode
        :returns: str
        :raises: IOError
        """
        if not self.exists(name):
            raise IOError(u"{0}: Not found in repository".format(name))

        e = self.index[name]

        return os.path.join(self.repo.workdir, e.path).decode('utf-8')

    def save(self, name, content):
        """
        Saves a new file using the storage system, preferably with the name
        specified. If there already exists a file with this name, the
        storage system may modify the filename as necessary to get a unique
        name. The actual name of the stored file will be returned.

        :param name: Name of the new file within the repository.
        :type name: unicode
        :param content: Content to save.
        :type content: django.core.files.File
        :returns: str
        """
        new_name = self.get_available_name(name)
        abspath = os.path.join(self.repo.workdir, new_name)

        dirname = os.path.dirname(abspath)

        if not os.path.exists(dirname):
            os.makedirs(dirname)

        with open(abspath, 'wb') as f:
            for chunk in content.chunks():
                f.write(chunk)

        # Report the name actually used, as documented above.
        return new_name

    def delete(self, name):
        """
        Deletes the file referenced by name.

        :param name: Name of the file within the repository to delete
        :type name: unicode
        :raises: IOError
        """
        if not self.exists(name):
            raise IOError(u"{0}: Not found in repository".format(name))

        abspath = os.path.join(self.repo.workdir, name)
        os.remove(abspath)
repo_url = 'https://github.com/octocat/Spoon-Knife.git'

# The local directory is the last URL segment without ".git", lower-cased.
urlChunks = repo_url.split('/')
repo_path = urlChunks[-1].replace('.git', '').lower()

#file = open(repo_path + '.csv', 'wb')
#csvWriter = csv.writer(file)

# Clone on first use only.
if not os.path.exists(repo_path):
    repo = clone_repository(repo_url, repo_path)

base = Repository(repo_path + '/.git')
base.checkout('HEAD')

# Commit ids, newest to oldest.
history = []
for commit in base.walk(base.head.target, GIT_SORT_TOPOLOGICAL):
    history.append(commit.hex)

# Disabled: display commits oldest to newest.
# print '-----------------------------------------------------------'
# for commit in base.walk(base.head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE):
#     print commit.hex
#     print base.revparse_single(commit.hex).message
#     print commit.commit_time
#     print commit.commit_time_offset
from pygit2 import Repository repo = Repository('.git') diff = repo.diff prev_commit = None consolidate = True if consolidate is True: matrix = {} else: matrix = [] for commit in repo.walk(repo.head.target): print(commit.message) if prev_commit is not None: # get the diff info diff = repo.diff(commit, prev_commit) # Get the string with changed info and split it changes = diff.patch.split('\n')[5:] try: a = changes[3].split() b = changes[4].split() except: print("last one? and I am too lasy to dump that one so here we go") Exception else:
def count_contribs(repo_str,num_months): repo = Repository((repo_str).strip()) last = repo[repo.head.target] now = date.fromtimestamp(time.time()) res_emp = list() res_vol = list() res_tot = list() res_emp_cont = list() res_vol_cont = list() res_tot_cont = list() res_vol_perc = list() res_leverage_people = list() res_leverage_patches = list() emails = list() authors = list() commits = list() for i in xrange(num_months): time_key = add_months(i, now) print time_key employee_authors = set() volunteer_authors = set() emp_contributions = 0 vol_contributions = 0 for commit in repo.walk(last.id, GIT_SORT_TIME): tpin = get_month_key(commit.commit_time) if tpin != time_key: continue; authors.append(commit.author) commits.append(commit) # All entries in authors.keys and emails should be unique at this # point. author = commit.author if (author.email in employees) or "@mozilla." in author.email: employee_authors.add(author.email) emp_contributions += 1 else: volunteer_authors.add(author.email) vol_contributions += 1 res_emp.append(len(employee_authors)) res_vol.append(len(volunteer_authors)) res_tot.append(len(employee_authors) + len(volunteer_authors)) res_emp_cont.append(emp_contributions) res_vol_cont.append(vol_contributions) res_tot_cont.append(emp_contributions + vol_contributions) if not 0 in (emp_contributions, employee_authors, vol_contributions): res_vol_perc.append(100 * vol_contributions / (emp_contributions + vol_contributions)) res_leverage_people.append(float(len(volunteer_authors)) / float(len(employee_authors))) res_leverage_patches.append(float(vol_contributions) / float(len(employee_authors))) print 'done: ' + repo_str print 'employees: ' + str(res_emp) print 'volunteers: ' + str(res_vol) filename = "HTML_OUTPUT/" + (repo_str).strip() + '.html' html= open(filename, 'w') html.write( '''<!doctype html> <html> <head> <title>Are We Everyone Yet?</title> <link href="index.css" rel="stylesheet" type="text/css"> </head> <body> <script 
src="./src/Chart.js"></script> <div class="section header"> <div id="banner"> Are We Everyone Yet?</div> </div> <div class="projects"><center> ''') for p in repos: html.write( '''<a href="%s.html">%s</a> ''' % (p.strip(), p.strip() )) html.write(''' </center></div> <div class="section"> <center> <p><span id="blue">Total</span>, <span id="yellow">Employees</span>, and <span id="red">Volunteer</span> contributors per Firefox release.</p><br/> <span id="slogan">Active contributors and employees per release.</span><br/> <p><canvas id="contribChart" width="800" height="400"></canvas></p><br/> <br/> <span id="slogan">Patches from volunteer contributors and employees per release.</span><br/> <p><canvas id="contribPercent" width="800" height="400"></canvas></p><br/> <span id="slogan">Leverage: Contributers Per Employee</span><br/> <p><canvas id="leveragePeopleOverall" width="800" height="400"></canvas></p> <span id="slogan">Leverage: Contributor Patches Per Employee</span><br/> <p><canvas id="leveragePatchesOverall" width="800" height="400"></canvas></p> <span id="slogan">Volunteer Commit Percentage Overall</span><br/> <p><canvas id="contribPercentOverall" width="800" height="400"></canvas></p> </center> <script> var myColor = { red : "rgba(230, 118, 39 , 1)", yellow : "rgba(255, 230, 17 , 1)", blue : "rgba(4,174,225, 1)", green : "rgba(57, 181, 17 , 1)" } document.getElementById("blue").style.color=myColor.blue document.getElementById("yellow").style.color=myColor.yellow document.getElementById("red").style.color=myColor.red var participation_data = { ''' ) html.write('versions: ' + str(list(range( 0 - num_months, 0))) + ',\n' + \ 'employees: ' + str(res_emp) + ',\n' + \ 'volunteers: ' + str(res_vol) + ',\n' + \ 'total: ' + str(res_tot) + ',\n' + \ 'employee_commits: ' + str(res_emp_cont) + ',\n' + \ 'volunteer_commits: ' + str(res_vol_cont) + ',\n' + \ 'total_commits: ' + str(res_tot_cont) + ',\n' + \ 'volunteer_commit_percentage: ' + str(res_vol_perc) + ',\n' + \ 
'leverage_people: ' + str(res_leverage_people) + ',\n' + \ 'leverage_patches: ' + str(res_leverage_patches) ) html.write( ''' } // Volunteers, up first. var lineGraphData = { labels : participation_data.versions, datasets : [ { fillColor : "rgba(0,0,0,0)",// : "rgba(0,0,0,0)", //myColor.blue, strokeColor :myColor.blue, pointColor :myColor.blue, pointStrokeColor : "#fff", data : participation_data.total }, { fillColor : "rgba(0,0,0,0)",// :"rgba(0,0,0,0)", //myColor.yellow, strokeColor : myColor.yellow, pointColor : myColor.yellow, pointStrokeColor : "#fff", data : participation_data.employees }, { fillColor : "rgba(0,0,0,0)",// : "rgba(0,0,0,0)", //gmyColor.red, strokeColor : myColor.red, pointColor : myColor.red, pointStrokeColor : "#fff", data : participation_data.volunteers } ] } var lineGraphParams = { scaleOverride: true, scaleSteps: 15, scaleStepWidth: 40, scaleStepStart: 0, scaleBeginAtZero: true } var lineGraph = new Chart(document.getElementById("contribChart").getContext("2d")).Line(lineGraphData, lineGraphParams); // Next up, contributor commits. 
var commitGraphData = { labels : participation_data.versions, datasets : [ { fillColor : "rgba(0,0,0,0)",// : myColor.blue, strokeColor : myColor.blue, pointColor : myColor.blue, pointStrokeColor : "#fff", data : participation_data.total_commits }, { fillColor : "rgba(0,0,0,0)",// : myColor.yellow, strokeColor : myColor.yellow, pointColor : myColor.yellow, pointStrokeColor : "#fff", data : participation_data.employee_commits }, { fillColor : "rgba(0,0,0,0)",// : myColor.red, strokeColor : myColor.red, pointColor : myColor.red, pointStrokeColor : "#fff", data : participation_data.volunteer_commits } ] } var commitGraphParams = { scaleOverride: true, scaleSteps: 20 , scaleStepWidth: 400, scaleStepsStart: 0, scaleBeginAtZero: true} var commitGraph = new Chart(document.getElementById("contribPercent").getContext("2d")).Line(commitGraphData, commitGraphParams); var percentGraphData = { labels : participation_data.versions, datasets : [ { fillColor : "rgba(0,0,0,0)",// : myColor.blue, strokeColor : myColor.blue, pointColor : myColor.blue, pointStrokeColor : "#fff", data : participation_data.volunteer_commit_percentage } ] } var percentGraphParams = { scaleOverride: true, scaleSteps: 10 , scaleStepWidth: 10, scaleStepsStart: 0, scaleBeginAtZero: true} var percentGraph = new Chart(document.getElementById("contribPercentOverall").getContext("2d")).Line(percentGraphData, percentGraphParams); var leveragePeopleGraphData= { labels : participation_data.versions, datasets : [ { fillColor : "rgba(0,0,0,0)",// : myColor.blue, strokeColor : myColor.blue, pointColor : myColor.blue, pointStrokeColor : "#fff", data : participation_data.leverage_people } ] } var leveragePeopleGraphParams = { scaleOverride: true, scaleSteps: 10 , scaleStepWidth: 0.2, scaleStepsStart: 0, scaleBeginAtZero: true} var leveragePeopleGraph= new Chart(document.getElementById("leveragePeopleOverall").getContext("2d")).Line(leveragePeopleGraphData, leveragePeopleGraphParams); var leveragePatchesGraphData = { 
labels : participation_data.versions, datasets : [ { fillColor : "rgba(0,0,0,0)",// : myColor.blue, strokeColor : myColor.blue, pointColor : myColor.blue, pointStrokeColor : "#fff", data : participation_data.leverage_patches } ] } var leveragePatchesGraphParams = { scaleOverride: true, scaleSteps: 10 , scaleStepWidth: 2 , scaleStepsStart: 0, scaleBeginAtZero: true} var leveragePatchesGraph = new Chart(document.getElementById("leveragePatchesOverall").getContext("2d")).Line(leveragePatchesGraphData, leveragePatchesGraphParams); </script> </div> </body> </html> ''' )
# NOTE(review): the `return resp` below is the tail of a function whose
# beginning is not part of this snippet; it is kept exactly as found.
    return resp


def get_attachement(num, attachment):
    """Download one raw attachment of a Trac ticket.

    Builds ``TRAC_HTTP + "/raw-attachment/ticket/<num>/<attachment>"``,
    prints the URL and the HTTP status code, and fetches it with
    ``requests.get``.

    :param num: ticket number (formatted with ``%d``).
    :param attachment: attachment file name (formatted with ``%s``).
    :returns: the response text when the status code is 200, else ``None``.
    """
    url = TRAC_HTTP + "/raw-attachment/ticket/%d/%s" % (num, attachment)
    print url
    resp = requests.get(url)
    print resp.status_code
    if resp.status_code == 200:
        return resp.text
    return None


# Walk the history from HEAD and look for commits whose message references a
# ticket as "#<digits>".
repo = Repository(GIT_REPO)
for commit in repo.walk(repo.head.oid, GIT_SORT_TIME):
    try:
        tickets = re.findall('#[0-9]+', commit.message)
        if tickets:
            # Only commits with exactly one parent are considered.
            if len(commit.parents) != 1:
                continue
            base = commit.parents[0].hex
            head = commit.hex
            title = commit.message
            # Use the first referenced ticket, without the leading '#'.
            ticket = tickets[0][1:]
            body = "PR for issue #%s" % (ticket)
            trac_ticket = trac._tracserver.ticket.get(ticket)
            num, updated, created, props = trac_ticket
            # Skip tickets that are not closed.
            if props['status'] != 'closed':
                continue
            print commit.message
            # NOTE(review): the snippet is truncated here -- the rest of the
            # loop body and the `except` clause of this `try` are missing.
def get_and_update_repo_cache(repo_path, repo_name): cache_filename = '%s-stats.cache' % repo_name if os.path.exists(cache_filename): with open(cache_filename) as f: data = load(f) else: data = { 'author_to_month_to_additions': defaultdict(defaultdict_int), 'author_to_month_to_deletions': defaultdict(defaultdict_int), 'author_to_month_to_changes': defaultdict(defaultdict_int), 'author_to_month_to_commits': defaultdict(defaultdict_int), 'day_to_count': defaultdict(defaultdict_int), 'change_count_by_file': defaultdict(int), 'latest_sha': None, } repo = Repository(repo_path) ignored_commits = [] count = 0 for commit in repo.walk(repo.head.target, GIT_SORT_TOPOLOGICAL): count += 1 if commit.type == GIT_OBJ_COMMIT: if data['latest_sha'] == commit.hex: break try: d = repo.diff('%s^' % commit.hex, commit) except KeyError: print "Commits without parent: ", commit.hex continue additions = d.stats.insertions deletions = d.stats.deletions author = author_aliases.get(commit.author.email, commit.author.email) day = date.fromtimestamp(commit.commit_time) data['day_to_count']['Lines'][day] += additions data['day_to_count']['Lines'][day] -= deletions if additions > 1000 and deletions < 5 and commit.hex not in whitelist_commits: if commit.hex not in blacklist_commits: ignored_commits.append(commit.hex) # print 'WARNING: ignored %s looks like an embedding of a lib (message: %s)' % (commit.hex, commit.message) continue if (additions > 3000 or deletions > 3000) and commit.hex not in whitelist_commits: if commit.hex not in blacklist_commits: ignored_commits.append(commit.hex) # print 'WARNING: ignored %s because it is bigger than 3k lines. 
Put this commit in the whitelist or the blacklist (message: %s)' % (commit.hex, commit.message) continue month = date(day.year, day.month, 1) data['author_to_month_to_additions'][author][month] += additions data['author_to_month_to_deletions'][author][month] += deletions data['author_to_month_to_changes'][author][month] += additions + deletions data['author_to_month_to_commits'][author][month] += 1 if data['latest_sha'] is None: data['latest_sha'] = commit.hex if d.patch: for changed_path in [x for x in d.patch.split('\n') if x.startswith('+++ ') and '/dev/null' not in x]: data['change_count_by_file'][changed_path[len('+++ ') + 1:]] += 1 with open(cache_filename, 'w') as f: dump(data, f) with open(repo_name + '-ignored-commits.txt', 'w') as f: f.writelines('%s\n' % x for x in ignored_commits) return data
class prototype: repo = "" # Path to a given repository name = "" # Name of a repository base = "" # Repository as defined in pygit2 # Initialization. Clones the given repository, placing it in the current directory, # and changes to the repository directory. def init(self, repository): self.repo = repository # Use regular expressions to match the last instance of a forward slash # followed by the name of the repository, which we wish to extract, followed # by ".git". m = re.search('/([^/]+).git$', repository) if m: self.name = m.group(1) if not os.path.isdir(self.name): os.system('git clone ' + self.repo) # Get the repository from GitHub self.base = Repository(self.name) self.base.checkout('HEAD') # Destruction. Remove the given repository from memory. def destroy(self): os.system('cd ' + self.name) os.system('rm -rf ' + self.name) # Get total LOC by given repository. def totalRepoLOC(self): loc = countDirLOC(self.name) return loc # Get total commits by a given repository def totalRepoCommits(self): commits = 0 for commit in self.base.walk(self.base.head.target, GIT_SORT_TOPOLOGICAL): commits = commits + 1 return commits # Get a list of LOC changed per commit def locPerCommit(self): loc = [] oldPath = os.popen('pwd') os.chdir(self.name) sha1 = 0 sha2 = 0 start = 1 total = self.totalRepoCommits() # For each commit within the repository for commit in self.base.walk(self.base.head.target, GIT_SORT_TOPOLOGICAL): print '\r', start, '/', total, start += 1 # Based on the SHA, use git to show the patch for that commit sha1 = sha2 sha2 = commit.hex if sha1 != 0: p = os.popen('git diff --shortstat ' + sha1 + ' ' + sha2) line = p.readline() # line contains "# file changed, # insertions(+), # deletions(-) # Use regular expressions to find the number of additions and deletions. # Additions are found after ", " and before " insertion". Deletions are # found after "(+), " and before " deletion". 
m = re.search(', (.*) insertion', line) additions = 0 deletions = 0 if m: additions = m.group(1) m = re.search('\(\+\), (.*) deletion', line) if m: deletions = m.group(1) # Get the total and append to array modifications = int(additions) + int(deletions) loc.append(modifications) os.chdir('..') return loc # Get a list containing the total number of line additions and deletions (including # whitespace and comments) contained within each hunk that was changed over t def locPerHunk(self): loc = [] history = [] # Get the hex number for each commit within the repository for commit in self.base.walk(self.base.head.target, GIT_SORT_TOPOLOGICAL): sha = commit.hex history.append(sha) # Compare each revision in the history of the repository with the previous rev. i = 0 while i < len(history) - 1: t0 = self.base.revparse_single(history[i]) t1 = self.base.revparse_single(history[i+1]) diff = self.base.diff(t0,t1) patches = [p for p in diff] for patch in patches: for hunk in patch.hunks: # Check the first character in each hunk line. Only those that have # been modified will contain a '+' (insertion) or '-' (deletion) totalModifications = 0 for line in hunk.lines: if line[0] == '-' or line[0] == '+': totalModifications +=1 loc.append(totalModifications) i += 1 return loc # Get the total number of lines contained within a hunk, including additions, deletions, # and surrounding non-changed lines def locInHunk(self): loc = [] history = [] # Get the hex number for each commit within the repository for commit in self.base.walk(self.base.head.target, GIT_SORT_TOPOLOGICAL): sha = commit.hex history.append(sha) # Compare each revision in the history of the repository with the previous rev. 
i = 0 while i < len(history) - 1: t0 = self.base.revparse_single(history[i]) t1 = self.base.revparse_single(history[i+1]) diff = self.base.diff(t0,t1) patches = [p for p in diff] for patch in patches: for hunk in patch.hunks: totalLines = 0 for line in hunk.lines: totalLines += 1 loc.append(totalLines) i += 1 return loc # Perform a diff between all commits starting from oldest to newest # and compile temp files comprised of only modified lines. # Run cloc on temp files to get sloc for each diff set. def slocPerDiff(self): # Storage for commit history hashes history = [] # Store all slocs slocPerDiffs = [] # Move through the system history from newest to oldest commit for commit in self.base.walk(self.base.head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE): history.append(commit) i = 0 while i < len(history) - 2: sloc = 0 t0 = self.base.revparse_single(history[i].hex) t1 = self.base.revparse_single(history[i+1].hex) try: diff = self.base.diff(t0,t1) except ValueError: print "Caught value error." 
i += 1 continue patches = [p for p in diff] for patch in patches: print patch.new_file_path hunkfile = open("tmp", 'w') for hunk in patch.hunks: totesLines = 0 totesMods = 0 for line in hunk.lines: totesLines += 1 if line[0] == '-' or line[0] == '+': totesMods += 1 hunkfile.write(line[1]) hunkfile.close() output = subprocess.Popen('cloc ' + patch.new_file_path + ' --by-file --csv', shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) start = False for line in output.stdout.readlines(): if line[0] == 'l': start = True continue if start: temp = line.split(',') sloc += int(temp[4].replace('\n', '')) retval = output.wait() os.remove("tmp") i += 1 slocPerDiffs.append(int(sloc)) return slocPerDiffs # Get a list containing the number of hunks changed per commit def hunksPerCommit(self): hunks = [] history = [] start = 1 total = self.totalRepoCommits() # Get the hex number for each commit within the repository for commit in self.base.walk(self.base.head.target, GIT_SORT_TOPOLOGICAL): sha = commit.hex history.append(sha) # Compare each revision in the history of the repository with the previous rev. i = 0 while i < len(history) - 1: print '\r', start, '/', total, start += 1 t0 = self.base.revparse_single(history[i]) t1 = self.base.revparse_single(history[i+1]) try: diff = self.base.diff(t0,t1) except ValueError: print "Caught value error." 
i += 1 continue patches = [p for p in diff] for patch in patches: hunks.append(len(patch.hunks)) i += 1 return hunks # Get a list of the number of files changed per commit def filesPerCommit(self): files = [] oldPath = os.popen('pwd') os.chdir(self.name) sha1 = 0 sha2 = 0 start = 1 total = self.totalRepoCommits() # For each commit within the repository for commit in self.base.walk(self.base.head.target, GIT_SORT_TOPOLOGICAL): print '\r', start, '/', total, start += 1 # Based on the SHA, use git to show the patch for that commit sha1 = sha2 sha2 = commit.hex if sha1 != 0: p = os.popen('git diff --shortstat ' + sha1 + ' ' + sha2) line = p.readline() # line contains "# file changed, # insertions(+), # deletions(-) # Use regular expressions to find the number of files modified, which # are contained first on the line followed by " file" m = re.search(' (.*) file', line) if m: numFilesChanged = int(m.group(1)) files.append(numFilesChanged) os.chdir('..') return files # Print out all stats for the repository def printStats(self): f = open(self.name + '-results.txt', 'w') f.write(("-----------" + self.name + "-----------\n")) # Stats on entire repository repoLOC = self.totalRepoLOC() repoCommits = self.totalRepoCommits() # Lists by commit locPerCommit = self.locPerCommit() #slocPerDiff = self.slocPerDiff() hunksPerCommit = self.hunksPerCommit() filesPerCommit = self.filesPerCommit() # Stats for LOC xsmall = 0 small = 0 medium = 0 large = 0 xlarge = 0 for item in locPerCommit: if (item >= 0 and item <= 5): xsmall += 1 if (item >= 6 and item <= 46): small += 1 if (item >= 47 and item <= 106): medium += 1 if (item >= 107 and item <= 166): large += 1 if (item >= 167): xlarge += 1 f.write("Number of Modified Lines:\n") f.write("x-small: " + str(xsmall) + "\n") f.write("small: " + str(small) + "\n") f.write("medium: " + str(medium) + "\n") f.write("large: " + str(large) + "\n") f.write("x-large: " + str(xlarge) + "\n") ''' # Stats for SLOC xsmall = 0 small = 0 medium = 0 large 
= 0 xlarge = 0 for item in slocPerDiff: if (item >= 0 and item <= 5): xsmall += 1 if (item >= 6 and item <= 46): small += 1 if (item >= 47 and item <= 106): medium += 1 if (item >= 107 and item <= 166): large += 1 if (item >= 167): xlarge += 1 f.write("Number of Modified SLOC: \n") f.write("x-small: " + str(xsmall) + "\n") f.write("small: " + str(small) + "\n") f.write("medium: " + str(medium) + "\n") f.write("large: " + str(large) + "\n") f.write("x-large: " + str(xlarge) + "\n") ''' # Print stats for modified files xsmall = 0 small = 0 medium = 0 large = 0 xlarge = 0 for item in filesPerCommit: if (item == 1): xsmall += 1 if (item >= 2 and item <= 4): small += 1 if (item >= 5 and item <= 7): medium += 1 if (item >= 8 and item <= 10): large += 1 if (item >= 11): xlarge += 1 f.write("Number of modified files: \n") f.write("x-small: " + str(xsmall) + "\n") f.write("small: " + str(small) + "\n") f.write("medium: " + str(medium) + "\n") f.write("large: " + str(large) + "\n") f.write("x-large: " + str(xlarge) + "\n") # Prints stats for hunks xsmall = 0 small = 0 medium = 0 large = 0 xlarge = 0 for item in hunksPerCommit: if (item >= 0 and item <= 1): xsmall += 1 if (item >= 2 and item <= 8): small += 1 if (item >= 9 and item <= 17): medium += 1 if (item >= 18 and item <= 26): large += 1 if (item >= 27): xlarge += 1 f.write("Number of hunks per commit: \n") f.write("x-small: " + str(xsmall) + "\n") f.write("small: " + str(small) + "\n") f.write("medium: " + str(medium) + "\n") f.write("large: " + str(large) + "\n") f.write("x-large: " + str(xlarge) + "\n") f.close()