def get_author(repo: git.Repository, commit: git.Oid):
    if config.IS_CONCOURSE:
        return repo.get(commit).author.email

    # use git user email as release's author, or failing that,
    # fall back to this commit's author
    emails = list(repo.config.get_multivar("user.email"))
    if emails:
        return emails[0]

    LOG.warning("Unable to determine author from repo config. "
                "Falling back to author of most recent commit.")
    return repo.get(commit).author.email
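# Usage sketch for the config fallback above (hedged: the repository path is
# illustrative, and Config.get_multivar returns an iterator over every
# configured value for the key).
import pygit2

repo = pygit2.Repository(".")
emails = list(repo.config.get_multivar("user.email"))
print(emails[0] if emails else "no user.email configured")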
def update_tree(
        repo: git.Repository, tree: git.Oid, path: List[str], content: str
) -> git.Oid:
    """
    adds a blob with `content` at `path` to `tree` in `repo`

    >>> repo = create_repo()
    >>> tree = repo.TreeBuilder().write()
    >>> for i in range(10):
    ...     path = store_hash(f"{i}")
    ...     content = nar_hash(path)
    ...     tree = update_tree(repo, tree, common.shards(path, depth=5), content)
    >>> print(tree)
    00f68bdb866b654d4ce3da90609b74137605bd90
    """
    for entry in repo.get(tree):
        # subdir exists: recurse
        if (entry.name == path[0]) and (entry.type == "tree"):
            sub = update_tree(repo, entry.id, path[1:], content)
            builder = repo.TreeBuilder(repo.get(tree))
            builder.remove(path[0])
            builder.insert(path[0], sub, git.GIT_FILEMODE_TREE)
            return builder.write()

    # subdir does not exist: create required objects
    if len(path) > 1:
        # write leaf node
        sub = update_tree(repo, repo.TreeBuilder().write(), [path[-1]], content)

        # build intermediate nodes
        for d in reversed(path[1:-1]):
            builder = repo.TreeBuilder()
            builder.insert(d, sub, git.GIT_FILEMODE_TREE)
            sub = builder.write()

        # attach to `tree`
        builder = repo.TreeBuilder(repo.get(tree))
        builder.insert(path[0], sub, git.GIT_FILEMODE_TREE)
        return builder.write()

    # path[0] is not a subdir: write blob
    elif len(path) == 1:
        blob = repo.write(git.GIT_OBJ_BLOB, content)
        builder = repo.TreeBuilder(repo.get(tree))
        builder.insert(path[0], blob, git.GIT_FILEMODE_BLOB)
        return builder.write()

    else:
        raise Exception(f"invalid path: {path}")
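# The doctest above relies on project helpers (create_repo, store_hash,
# nar_hash, common.shards). A minimal standalone exercise of update_tree
# against a fresh bare repository might look like this; the path, shard
# names and content are illustrative.
import pygit2 as git

repo = git.init_repository("/tmp/update-tree-demo", bare=True)
root = repo.TreeBuilder().write()  # start from the empty tree
root = update_tree(repo, root, ["ab", "cd", "hashfile"], "nar-hash-value")
print(repo.get(root)["ab"])  # the freshly created "ab" subtree entry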
def pull_repo(repo_path: Path):
    """Update a repository at repo_path by pulling from the remote named origin."""
    repo = Repository(repo_path)
    remote = repo.remotes['origin']
    remote.fetch()
    master_id = repo.lookup_reference('refs/remotes/origin/master').target
    merge_result, _ = repo.merge_analysis(master_id)
    if merge_result & GIT_MERGE_ANALYSIS_UP_TO_DATE:
        return
    if merge_result & GIT_MERGE_ANALYSIS_FASTFORWARD:
        repo.checkout_tree(repo.get(master_id))
        master_ref = repo.lookup_reference('refs/heads/master')
        master_ref.set_target(master_id)
        repo.head.set_target(master_id)
    elif merge_result & GIT_MERGE_ANALYSIS_NORMAL:
        repo.merge(master_id)
        assert repo.index.conflicts is None, \
            'Merge conflicts, please manually fix'
        tree = repo.index.write_tree()
        repo.create_commit('refs/heads/master', SIGNATURE, SIGNATURE,
                           '[build-server]: Merge', tree,
                           [repo.head.target, master_id])
        repo.state_cleanup()
def pull(repo: pygit2.Repository, remote_name: str = 'origin'):
    """
    Pull

    :param repo: the repository to pull
    :param remote_name: name of the remote
    :return:
    """
    for remote in repo.remotes:
        if remote.name == remote_name:
            remote.fetch()
            remote_master_id = repo.lookup_reference(
                'refs/remotes/origin/master').target
            merge_result, _ = repo.merge_analysis(remote_master_id)
            # Up to date, do nothing
            if merge_result & pygit2.GIT_MERGE_ANALYSIS_UP_TO_DATE:
                return
            # We can just fastforward
            elif merge_result & pygit2.GIT_MERGE_ANALYSIS_FASTFORWARD:
                repo.checkout_tree(repo.get(remote_master_id))
                master_ref = repo.lookup_reference('refs/heads/master')
                master_ref.set_target(remote_master_id)
                repo.head.set_target(remote_master_id)
            elif merge_result & pygit2.GIT_MERGE_ANALYSIS_NORMAL:
                repo.merge(remote_master_id)
                print(repo.index.conflicts)
                assert repo.index.conflicts is None, 'Conflicts, ahhhh!'
                user = repo.default_signature
                tree = repo.index.write_tree()
                repo.create_commit('HEAD', user, user, 'Merge!', tree,
                                   [repo.head.target, remote_master_id])
                repo.state_cleanup()
            else:
                raise AssertionError('Unknown merge analysis result')
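# Both pull helpers above follow the same fetch / analyze / fast-forward-or-
# merge pattern. Invoking the second one could look like this (a hedged
# sketch; the clone path is illustrative).
import pygit2

repo = pygit2.Repository(pygit2.discover_repository("/path/to/clone"))
pull(repo)  # fetches origin, then fast-forwards or merges origin/master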
def _create_corresponding_bug(
        closing_commit: pygit2.Commit,
        project_repo: pygit2.Repository,
        issue_id: tp.Optional[int] = None,
        creation_date: tp.Optional[datetime] = None,
        resolution_date: tp.Optional[datetime] = None) -> PygitBug:
    """
    Create the bug corresponding to a given closing commit.

    Applies simple SZZ algorithm as implemented in pydriller to find
    introducing commits.

    Args:
        closing_commit: commit closing the bug.
        project_repo: pygit2 repository of the project
        issue_id: optional issue number related to the bug
        creation_date: optional creation date of the bug
        resolution_date: optional resolution date of the bug

    Returns:
        the specified bug
    """
    pydrill_repo = pydriller.Git(project_repo.path)

    introducing_commits: tp.Set[pygit2.Commit] = set()
    blame_dict = pydrill_repo.get_commits_last_modified_lines(
        pydrill_repo.get_commit(str(closing_commit.id)))

    for _, introducing_set in blame_dict.items():
        for introducing_id in introducing_set:
            introducing_commits.add(project_repo.get(introducing_id))

    return PygitBug(closing_commit, introducing_commits, issue_id,
                    creation_date, resolution_date)
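# For reference, the pydriller call driving the SZZ step above can be run on
# its own: get_commits_last_modified_lines maps each file modified by the fix
# to the set of commit hashes that last touched the changed lines. The path
# and hash below are placeholders.
import pydriller

gr = pydriller.Git("/path/to/project")
fix_commit = gr.get_commit("<fix-commit-hash>")  # hypothetical closing commit
for path, introducers in gr.get_commits_last_modified_lines(fix_commit).items():
    print(path, sorted(introducers))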
def getTagList(obj, startJd, endJd):
    """
    returns a list of (epoch, tag_name) tuples
    """
    repo = Repository(obj.vcsDir)
    startEpoch = getEpochFromJd(startJd)
    endEpoch = getEpochFromJd(endJd)
    data = []
    for refName in repo.references:
        if not refName.startswith("refs/tags/"):
            continue
        tagName = refName[len("refs/tags/"):]
        ref = repo.references[refName]
        tag = repo.get(ref.target)
        # in some cases, tag is an instance of _pygit2.Commit, with tag.author
        # in other cases, tag is an instance of _pygit2.Tag, with tag.tagger
        try:
            author = tag.author
        except AttributeError:
            author = tag.tagger
        epoch = author.time  # type: int
        if epoch < startEpoch:
            continue
        if epoch >= endEpoch:
            # references are not ordered by time, so skip rather than stop
            continue
        data.append((
            epoch,
            tagName,
        ))
    return data
def get_latest_changing_commits_for_tree(repo: git.Repository, root: git.Commit,
                                         filter_path: str):
    """Get the 'latest changing commit' for each file in the tree with path
    `filter_path` in the commit `root`, like GitHub does in their tree view.

    While this intuitively is the 'most recent commit to change the file', it
    is actually the oldest ancestor commit above `root` such that all commits
    between it and `root` inclusive point to the same blob.

    Return a compound dict of the form:

    {
        'file1': {
            'oid': '<file-blob-sha-in-hex>',
            'latest': '<ancestor-commit-sha-in-hex>'
        },
        'path/file2': {
            'oid': '<file-blob-sha-in-hex>',
            'latest': '<ancestor-commit-sha-in-hex>'
        },
    }
    """
    commit = root
    commits = {}
    resolved = []

    tree = _read_tree(repo, root, filter_path)
    for path in tree:
        tree[path]['latest'] = commit
    commits[commit.hex] = {'tree': tree, 'held': {**tree}}

    pending_commits = set((commit.hex, ))
    while pending_commits:
        commit = repo.get(pending_commits.pop())
        working = commits[commit.hex]['held']
        commits[commit.hex]['held'] = {}
        for (path, path_dict) in working.items():
            assert 'oid' in path_dict
            assert 'latest' in path_dict
            found = False
            for parent_commit in commit.parents:
                if parent_commit.hex not in commits:
                    parent_tree = _read_tree(repo, parent_commit, filter_path)
                    commits[parent_commit.hex] = {
                        'tree': parent_tree,
                        'held': {}
                    }
                else:
                    parent_tree = commits[parent_commit.hex]['tree']
                assert isinstance(parent_tree, dict)
                parent_path_dict = parent_tree.get(path, None)
                if parent_path_dict and parent_path_dict['oid'] == path_dict['oid']:
                    found = True
                    path_dict['latest'] = parent_commit
                    commits[parent_commit.hex]['held'][path] = path_dict
                    pending_commits.add(parent_commit.hex)
                    break
            if not found:
                resolved.append((path, path_dict))

    #resolved.sort(key=lambda t: t[0])
    #for (path, path_dict) in resolved:
    #    print(path, path_dict['latest'].message.splitlines()[0])
    return dict(resolved)
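# A caller would likely print results the way the commented-out debug loop
# does. Hedged sketch: despite the docstring, 'latest' holds a commit object
# rather than a hex string, _read_tree is a project-local helper not shown
# here, and an empty filter path is assumed to select the repository root.
import pygit2 as git

repo = git.Repository(".")
root = repo.head.peel(git.Commit)
for path, info in get_latest_changing_commits_for_tree(repo, root, "").items():
    print(path, info['latest'].message.splitlines()[0])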
def save_history_features_graph(repo_path, branch, graph_path):
    """ Track the number of developers that have worked in a repository and
    save the results in a graph which could be used for later use.
    """
    repo = Repository(repo_path)
    head = repo.references.get(branch)

    commits = list(
        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))
    current_commit = repo.head.target

    start_time = time.time()
    all_files = {}
    current_commit = repo.get(str(current_commit))
    files = get_files_in_tree(current_commit.tree, repo)

    for (_, name) in tqdm(files):
        all_files[name] = {}
        all_files[name]['lastcommit'] = current_commit.hex
        all_files[name][current_commit.hex] = {}
        all_files[name][current_commit.hex]["prevcommit"] = ""
        all_files[name][current_commit.hex]["authors"] = [
            current_commit.committer.name
        ]

    for i, commit in enumerate(tqdm(commits[1:])):
        files = get_diffing_files(commit, commits[i], repo)
        for (_, name, _) in files:
            if name not in all_files:
                all_files[name] = {}

            last_commit = ""
            if 'lastcommit' not in all_files[name]:
                all_files[name]['lastcommit'] = commit.hex
            else:
                last_commit = all_files[name]['lastcommit']

            all_files[name][commit.hex] = {}
            all_files[name][commit.hex]["prevcommit"] = last_commit

            authors = set([commit.committer.name])
            if last_commit:
                authors.update(all_files[name][last_commit]["authors"])
            all_files[name][commit.hex]["authors"] = authors

            all_files[name]['lastcommit'] = commit.hex

    with open(graph_path, 'w') as output:
        json.dump(all_files, output, default=set_to_list)

    end_time = time.time()
    print("Done")
    print("Overall processing time {}".format(end_time - start_time))
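# Illustrative run (paths and branch name are assumptions). The JSON written
# above maps each file to its 'lastcommit' plus a per-commit record of
# 'prevcommit' and 'authors'.
import json

save_history_features_graph("/path/to/repo", "refs/heads/master",
                            "history_graph.json")
with open("history_graph.json") as fh:
    graph = json.load(fh)
print("{} files tracked".format(len(graph)))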
def get_history_features(graph, repo_path, branch):
    """ Function that extracts the history features from a git repository.
    They are the total number of authors, the total age and the total number
    of unique changes.
    """
    repo = Repository(repo_path)
    head = repo.references.get(branch)

    commits = list(
        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))

    features = []
    commit_feat = []
    commit_feat.append(str(commits[0].hex))
    commit_feat.append(str(1.0))
    commit_feat.append(str(0.0))
    commit_feat.append(str(0.0))
    features.append(commit_feat)

    for i, commit in enumerate(tqdm(commits[1:])):
        files = get_diffing_files(commit, commits[i], repo)

        total_number_of_authors = set()
        total_age = []
        total_unique_changes = set()
        for (_, name, _) in files:
            sub_graph = graph[name][commit.hex]
            total_number_of_authors.update(sub_graph['authors'])

            prev_commit = sub_graph['prevcommit']
            if prev_commit:
                total_unique_changes.add(prev_commit)
                prev_commit_obj = repo.get(prev_commit)
                total_age.append(commit.commit_time -
                                 prev_commit_obj.commit_time)
        total_age = float(sum(total_age)) / len(total_age) if total_age else 0

        commit_feat = []
        commit_feat.append(str(commit.hex))
        commit_feat.append(str(float(len(total_number_of_authors))))
        commit_feat.append(str(float(total_age)))
        commit_feat.append(str(float(len(total_unique_changes))))
        features.append(commit_feat)
    return features
def give_work():
    try:
        repo = Repository(local_file_path)
    except:
        #repo_url = git_File_name
        #repo_path = local_file_path
        repo = clone_repository(git_File_name, local_file_path)

    commits = []
    for commit in repo.walk(repo.head.target):
        commits.append(repo.get(commit.id))

    global next_task
    if next_task < len(commits):
        commit_hash = commits[next_task]
        next_task += 1
        return jsonify({'commit': str(commit_hash.id), 'id': next_task})
    else:
        return "No Work"
class Repo:
    def __init__(self, config):
        self.repo = Repository(config.repository_path)
        self.config = config
        self.lock = threading.Lock()
        master_ref = self.repo.lookup_reference("refs/heads/master")
        self.repo.checkout(master_ref)
        self.cred = MyRemoteCallback(config)

    def lock_patch_work(self, id):
        self.lock.acquire(True)
        try:
            # first lets update master
            self.repo.remotes[self.config.repository_patch_origin].fetch()
            # get the latest master
            master_ref = self.repo.branches.remote[
                self.config.repository_patch_origin + '/master']
            # In case the branch exists, delete it
            if id in self.repo.branches:
                self.repo.branches.delete(id)
            # create a new branch
            local = self.repo.branches.local.create(id, master_ref.peel())
            # finally switch over
            self.repo.checkout(local)
        except Exception as e:
            self.lock.release()
            raise e

    def unlock_patch_work(self, id):
        try:
            self.repo.remotes[self.config.repository_patch_destination].push(
                ["+refs/heads/" + id], callbacks=self.cred)
            master_ref = self.repo.branches.remote[
                self.config.repository_patch_origin + '/master']
            self.repo.checkout(master_ref)
        finally:
            self.lock.release()

    def dispatch(self, id):
        # FIXME also delete on the remote
        self.repo.branches.delete(id)

    def fetch_commit_message(self, chash):
        obj = self.repo.get(chash)
        return obj.message
def get_experience_features(graph, repo_path, branch):
    """ Function that extracts the experience features from an experience graph.
    """
    repo = Repository(repo_path)
    head = repo.references.get(branch)

    commits = list(
        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))

    current_commit = repo.head.target
    files = get_files_in_tree(repo.get(str(current_commit)).tree, repo)

    features = []
    commit_feat = []
    commit_feat.append(str(commits[0].hex))
    commit_feat.append(str(1.0))
    commit_feat.append(str(len(files)))
    commit_feat.append(str(0.0))
    features.append(commit_feat)

    for _, commit in enumerate(tqdm(commits[1:])):
        author = commit.committer.name

        exp = graph[author][commit.hex]['exp']
        rexp = graph[author][commit.hex]['rexp']
        try:
            rrexp = sum([float(float(e[0]) / (float(e[1]) + 1)) for e in rexp])
        except:
            print(author)
            print(commit.hex)
            print(rexp)
            sys.exit(1)

        commit_feat = []
        commit_feat.append(str(commit.hex))
        commit_feat.append(str(float(exp)))
        commit_feat.append(str(float(rrexp)))
        commit_feat.append(str(float(0)))
        features.append(commit_feat)
    return features
def transform_data(input):
    repo = Repository(get_git_dir())
    output = []
    for row in input["results"]:
        if row["command"].endswith("[ERROR]"):
            continue
        dir, commit_hash = re.match("^([^ ]+/)?([0-9a-f]+)",
                                    row["command"]).groups()
        commit = repo.get(commit_hash)
        if commit is None:
            # Repository.get returns None for unknown objects instead of
            # raising KeyError, so check the result explicitly.
            print(f"Skipping commit {commit_hash}", file=sys.stderr)
            continue
        for time in row["times"]:
            output_row = row.copy()
            output_row["commit"] = f"{commit.commit_time}-{commit_hash}"
            output_row["message"] = commit.message
            del output_row["times"]
            output_row["time"] = time
            output.append(output_row)
    return output
class GitBareBackend(object):

    nb_transactions = 0

    def __init__(self, path):
        self.path = abspath(path) + '/'
        # Open database
        self.path_data = '%s/database/' % self.path
        if not lfs.is_folder(self.path_data):
            error = '"%s" should be a folder, but it is not' % path
            raise ValueError(error)
        # Open repository
        self.repo = Repository(self.path_data)
        # Read index
        try:
            tree = self.repo.head.peel(GIT_OBJ_TREE)
            self.repo.index.read_tree(tree.id)
        except:
            pass
        # Check git committer
        try:
            _, _ = self.username, self.useremail
        except:
            print('=========================================')
            print('ERROR: Please configure GIT committer via')
            print(' $ git config --global user.name')
            print(' $ git config --global user.email')
            print('=========================================')
            raise

    @classmethod
    def init_backend(cls, path, init=False, soft=False):
        init_repository('{0}/database'.format(path), bare=True)

    #######################################################################
    # Internal utility functions
    #######################################################################
    def _call(self, command):
        """Interface to call git.git for functions not yet implemented using
        libgit2.
        """
        popen = Popen(command, stdout=PIPE, stderr=PIPE, cwd=self.path_data)
        stdoutdata, stderrdata = popen.communicate()
        if popen.returncode != 0:
            raise EnvironmentError(popen.returncode, stderrdata)
        return stdoutdata

    @lazy
    def username(self):
        cmd = ['git', 'config', '--get', 'user.name']
        try:
            username = self._call(cmd).rstrip()
        except EnvironmentError:
            raise ValueError(
                "Please configure 'git config --global user.name'")
        return username

    @lazy
    def useremail(self):
        cmd = ['git', 'config', '--get', 'user.email']
        try:
            useremail = self._call(cmd).rstrip()
        except EnvironmentError:
            raise ValueError(
                "Please configure 'git config --global user.email'")
        return useremail

    def _resolve_reference(self, reference):
        """This method returns the SHA the given reference points to. For now
        only HEAD is supported.

        FIXME This is quick & dirty.
        TODO Implement references in pygit2 and use them here.
        """
        # Case 1: SHA
        if len(reference) == 40:
            return reference

        # Case 2: reference
        reference = self.repo.lookup_reference(reference)
        try:
            reference = reference.resolve()
        except KeyError:
            return None
        return reference.target

    def normalize_key(self, path, __root=None):
        # Performance is critical so assume the path is already relative to
        # the repository.
        key = __root.resolve(path)
        if key and key[0] == '.git':
            err = "bad '{0}' path, access to the '.git' folder is denied"
            raise ValueError(err.format(path))
        return '/'.join(key)

    def handler_exists(self, key):
        tree = self.repo.head.peel(GIT_OBJ_TREE)
        try:
            tree[key]
        except:
            return False
        return True

    def get_handler_names(self, key):
        try:
            tree = self.repo.head.peel(GIT_OBJ_TREE)
            if key:
                tree_entry = tree[key]
                if tree_entry.type == 'blob':
                    raise ValueError
                tree = self.repo[tree_entry.id]
        except:
            yield None
        else:
            for item in tree:
                yield item.name

    def get_handler_data(self, key):
        tree = self.repo.head.peel(GIT_OBJ_TREE)
        tree_entry = tree[key]
        blob = self.repo[tree_entry.id]
        return blob.data

    def get_handler_mimetype(self, key):
        data = self.get_handler_data(key)
        return magic_from_buffer(data)

    def handler_is_file(self, key):
        return not self.handler_is_folder(key)

    def handler_is_folder(self, key):
        repository = self.repo
        if key == '':
            return True
        else:
            tree = repository.head.peel(GIT_OBJ_TREE)
            tree_entry = tree[key]
        return tree_entry.type == 'tree'

    def get_handler_mtime(self, key):
        # FIXME
        return datetime.utcnow().replace(tzinfo=fixed_offset(0))

    def traverse_resources(self):
        tree = self.repo.head.peel(GIT_OBJ_TREE)
        yield self.get_resource('/')
        for name in self.get_names(tree):
            if name[-9:] == '.metadata' and name != '.metadata':
                yield self.get_resource('/' + name[:-9])

    def get_names(self, tree, path=''):
        for entry in tree:
            base_path = '{0}/{1}'.format(path, entry.name)
            yield base_path
            if entry.filemode == GIT_FILEMODE_TREE:
                sub_tree = self.repo.get(entry.hex)
                for x in self.get_names(sub_tree, base_path):
                    yield x

    def do_transaction(self, commit_message, data, added, changed, removed,
                       handlers):
        self.nb_transactions += 1
        # Get informations
        git_author, git_date, git_msg, docs_to_index, docs_to_unindex = data
        git_msg = commit_message or git_msg or 'no comment'
        # List of Changed
        added_and_changed = list(added) + list(changed)
        # Build the tree from index
        index = self.repo.index
        for key in added_and_changed:
            handler = handlers.get(key)
            blob_id = self.repo.create_blob(handler.to_str())
            entry = IndexEntry(key, blob_id, GIT_FILEMODE_BLOB_EXECUTABLE)
            index.add(entry)
        for key in removed:
            index.remove(key)
        git_tree = index.write_tree()
        # Commit
        self.git_commit(git_msg, git_author, git_date, tree=git_tree)

    def git_commit(self, message, author=None, date=None, tree=None):
        """Equivalent to 'git commit', we must give the message and we can
        also give the author and date.
        """
        # Tree
        if tree is None:
            #tree = self.index.write_tree()
            raise ValueError('Please give me a tree')

        # Parent
        parent = self._resolve_reference('HEAD')
        parents = [parent] if parent else []

        # Committer
        when_time = time.time()
        when_offset = -(time.altzone if time.daylight else time.timezone)
        when_offset = when_offset / 60

        name = self.username
        email = self.useremail
        committer = Signature(name, email, when_time, when_offset)

        # Author
        if author is None:
            author = (name, email)

        if date:
            if date.tzinfo:
                from pytz import utc
                when_time = date.astimezone(utc)   # To UTC
                when_time = when_time.timetuple()  # As struct_time
                when_time = timegm(when_time)      # To unix time
                when_offset = date.utcoffset().seconds / 60
            else:
                err = "Worktree.git_commit doesn't support naive datetime yet"
                raise NotImplementedError(err)

        author = Signature(author[0], author[1], when_time, when_offset)

        # Create the commit
        return self.repo.create_commit('HEAD', author, committer, message,
                                       tree, parents)

    def abort_transaction(self):
        # TODO: Remove created blobs
        pass
def get_general_data(repo_path, issue_path, labels, pairs):
    """ Function to get general statistics for a git repository. """
    repo = Repository(repo_path)
    issue_list = {}
    labeled_commits = {}
    with open(labels, 'r') as inp:
        reader = csv.reader(inp)
        next(reader)
        for commit in reader:
            labeled_commits[commit[0]] = float(commit[1])

    print("Number of commits: {}".format(len(labeled_commits)))
    print("Number of found bugintroducing commits: {}".format(
        len([
            labeled_commits[f] for f in labeled_commits
            if labeled_commits[f] > 0
        ])))

    pair_map = []
    with open(pairs, 'r') as inp:
        pair_map = json.load(inp)

    total_fixes = set([p[0] for p in pair_map])
    print("Total number of fixes used: {}".format(len(total_fixes)))

    bug_labeled_commits = set(
        [l for l in labeled_commits if labeled_commits[l] > 0])
    fixes_in_bugs = set(bug_labeled_commits).intersection(total_fixes)
    print("Total number of fixes in bugs found : {}".format(
        len(fixes_in_bugs)))

    time_diff = []
    for pair in pair_map:
        fix = repo.get(pair[0])
        bug = repo.get(pair[1])

        fix_date = datetime.fromtimestamp(fix.commit_time).replace(tzinfo=None)
        bug_date = datetime.fromtimestamp(bug.commit_time).replace(tzinfo=None)

        diff = (fix_date - bug_date).days
        time_diff.append(diff)

    years, days = divmod(float(mean(time_diff)), 365.25)
    myears, mdays = divmod(float(median(time_diff)), 365.25)
    print(
        "Average time between bug introduction and fix: {} years and {} days".
        format(years, days))
    print("Median time between bug introduction and fix: {} years and {} days".
          format(myears, mdays))

    with open(issue_path, 'r') as inp:
        issue_list = json.load(inp)
    print("Total number of fixes found: {}".format(len(issue_list)))

    save_commit_messages([repo.get(c) for c in bug_labeled_commits], repo)
    get_average_time_issues(issue_path)
if __name__ == '__main__':
    if len(sys.argv) != 3:
        print("Correct usage: script, localfilepath, gitrepopath")
        exit()

    local_file_path = str(sys.argv[1])
    git_File_name = str(sys.argv[2])

    has_work = True  # renamed from `bool` to avoid shadowing the builtin
    executiontime_list = []
    result_list = []
    id = 0
    while has_work:
        # run until work is finished
        try:
            repo = Repository(local_file_path)
        except:
            #repo_url = 'https://github.com/rubik/radon.git'
            #repo_path = 'D:/Users/AJ/PycharmProjects/untitled1/radon'
            repo = clone_repository(git_File_name, local_file_path)

        commits = []
        for commit in repo.walk(repo.head.target):
            commits.append(repo.get(commit.id))

        try:
            work, id, executiontime = get_work(repo)
            print(id)
            executiontime_list.append(executiontime)
        except:
            has_work = False
            print("Process Terminated")

    report = {'executiontime': executiontime_list}
    send_results(report)
class GitRepo(object):
    ''' git repo class '''

    def __init__(self, path):
        try:
            self.__repo = Repository(path)
        except Exception as e:
            self.__repo = None
            print(e)

    def get_info(self):
        if not self.__repo:
            return None
        signature = self.__repo.default_signature
        result = {
            'path': self.__repo.path,
            'workdir': self.__repo.workdir,
            'bare': self.__repo.is_bare,
            'empty': self.__repo.is_empty,
            'name': signature.name,
            'email': signature.email,
            'time': signature.time,
            'offset': signature.offset,
        }
        return result

    def get_all_references(self):
        return self.__repo.listall_references()

    def get_reference(self, name):
        if not self.__repo:
            return None
        ref = None
        try:
            ref = self.__repo.lookup_reference(name)
        except Exception as e:
            print(e)
        return ref

    def get_all_branches(self, branch_type=None):
        if not self.__repo:
            return None
        if branch_type:
            return self.__repo.listall_branches(branch_type)
        r = self.__repo.listall_branches(GIT_BRANCH_LOCAL | GIT_BRANCH_REMOTE)
        return r

    def get_branch(self, name, branch_type=GIT_BRANCH_LOCAL):
        if not self.__repo:
            return None
        return self.__repo.lookup_branch(name, branch_type)

    def check_branch(self, name, branch_type=None):
        if not branch_type:
            if '/' in name:
                branch_type = GIT_BRANCH_REMOTE
            else:
                branch_type = GIT_BRANCH_LOCAL
        try:
            result = self.get_branch(name, branch_type)
            return result
        except Exception as e:
            print(e)
            return False

    def get_current_commit(self):
        if not self.__repo:
            return None
        commit = self.__repo.revparse_single('HEAD')
        return self.get_commit(commit)

    def get_commit_by_branch(self, branch):
        if not self.__repo:
            return None
        query = 'refs/'
        if hasattr(branch, 'remote_name'):
            query += 'remotes/'
        else:
            query += 'heads/'
        query += branch.branch_name
        try:
            ref = self.get_reference(query)
            commit = ref.target
            return self.get_commit(commit)
        except Exception as e:
            print(e)
            return None

    def get_commit_by_tag(self, tag):
        if self.__repo is None:
            return None
        if tag:
            commit = tag.get_object()
            return self.get_commit(commit)
        return None

    def get_commit(self, oid_or_commit):
        ''' return a commit w/ json '''
        if not self.__repo or not oid_or_commit:
            return None
        try:
            commit = oid_or_commit
            if not isinstance(oid_or_commit, Commit):
                commit = self.__repo.get(oid_or_commit)
            if commit and commit.type == GIT_OBJ_COMMIT:
                # t1 = self.__repo.revparse_single('HEAD^')
                # t2 = self.__repo.revparse_single('HEAD^^')
                # patches = self.__repo.diff(t1, t2)
                # for p in patches:
                #     print(p.new_file_path)
                result = {
                    'id': str(commit.id),
                    'author': commit.author.name,
                    'commiter': commit.committer.name,
                    'message': commit.message,
                    'message_encoding': commit.message_encoding,
                    'tree': str(commit.tree_id),
                    'parent': [str(pid) for pid in commit.parent_ids],
                    'time': str(commit.commit_time),
                    'time_offset': str(commit.commit_time_offset),
                }
                return result
        except Exception as e:
            print(e)
        return None

    def get_commits(self, depth=10, oid_or_commit=None):
        result = []
        if depth == 0:
            return result
        if oid_or_commit:
            commit = self.get_commit(oid_or_commit)
        else:
            commit = self.get_current_commit()
        if not commit:
            return result
        # TODO: starting from a commit or its parent
        # TODO: author
        result.append(commit)
        depth -= 1
        if commit and commit['parent']:
            for parent in commit['parent']:
                result.extend(self.get_commits(depth, parent))
        return result

    def get_commits_by_branch(self, name, path=None):
        if not self.__repo:
            return None
        if self.check_branch(name):
            ref = self.get_reference('refs/heads/' + name)
            if ref:
                commit = ref.target
                # get_commits takes a depth first, so pass the starting
                # commit as a keyword argument; it returns a list of dicts
                commits = self.get_commits(oid_or_commit=commit)
                result = {}
                for val in commits:
                    if self.check_commit_by_path(val, path):
                        result[val['id']] = val
                return result
        return None

    def check_tag(self, name):
        try:
            ref = self.get_reference('refs/tags/' + name)
            return ref
        except Exception:
            return False

    def get_commits_by_tag(self, tag, path=None):
        if not self.__repo:
            return None
        if tag:
            commit = tag.target
            commits = self.get_commits(oid_or_commit=commit)
            result = {}
            for val in commits:
                if self.check_commit_by_path(val, path):
                    result[val['id']] = val
            return result
        return None

    def check_commit_by_path(self, commit, path):
        if not commit:
            return False
        if path is None or len(path) == 0:
            return True
        result = self.get_tree(commit['tree'])
        if not isinstance(path, list):
            path = path.strip().split('/')
        for name in path:
            name = name.strip()
            if name in result:
                oid = result[name]
                result = self.get_tree(oid)
                if not result:
                    result = self.get_blob(oid)
        return result is not None

    def get_tree(self, oid, ppath=None):
        if not self.__repo:
            return None
        try:
            tree = self.__repo.get(oid)
            if tree and tree.type == GIT_OBJ_TREE:
                result = {}
                for entry in tree:
                    item = {
                        'id': str(entry.id)
                    }
                    obj = self.__repo.get(entry.id)
                    if obj.type == GIT_OBJ_BLOB:
                        item['type'] = 'blob'
                    elif obj.type == GIT_OBJ_TREE:
                        item['type'] = 'tree'
                    item['ppath'] = ppath
                    result[entry.name] = item
                return result
        except Exception as e:
            print(e)
        return None

    def get_tree_by_commit(self, commit, path=None):
        if not commit:
            return None
        result = self.get_tree(commit['tree'])
        if not path:
            return result
        # if not isinstance(path, list):
        #     path = path.strip().split('/')
        try:
            for name in path:
                oid = result[name]['id']
                p = result[name]['ppath']
                p = name if not p else p + '/' + name
                result = self.get_tree(oid, p)
                if not result:
                    break
        except Exception as e:
            print(e)
            result = None
        return result

    def get_current_root(self):
        tree = self.get_current_commit()
        if tree:
            return self.get_tree(tree['tree'])
        return None

    def get_whole_tree(self, oid):
        ''' tree w/ json '''
        if not self.__repo:
            return None
        result = tree_walker(self.__repo, oid)
        return result

    def get_blob(self, oid):
        ''' blob w/ json '''
        if not self.__repo or not oid:
            return None
        try:
            blob = self.__repo.get(oid)
            if blob and blob.type == GIT_OBJ_BLOB:
                content = blob.is_binary and None or blob.data.decode(
                    'utf8', 'ignore')
                result = {
                    'id': str(blob.id),
                    'content': content,
                    'size': blob.size,
                }
                return result
        except Exception as e:
            print(e)
        return None

    def get_blob_by_commit(self, commit, path=None):
        try:
            tree = self.get_tree_by_commit(commit, path[:-1])
            oid = tree[path[-1]]['id']
            result = self.get_blob(oid)
            return result
        except Exception as e:
            print(e)
            return None

    def get_tag(self, oid):
        ''' tag w/ json '''
        if not self.__repo or not oid:
            return None
        try:
            tag = self.__repo.get(oid)
            if tag and tag.type == GIT_OBJ_TAG:
                result = {
                    'id': str(oid),
                    'name': tag.name,
                    'target': str(tag.target.id),
                    'tagger': tag.tagger,
                    'message': tag.message,
                }
                return result
        except Exception as e:
            print(e)
        return None

    def get_patches(self, a=None, b=None):
        try:
            if not a:
                a = 'HEAD'
            if not b:
                b = a + '^'
            t1 = self.__repo.revparse_single(a)
            t2 = self.__repo.revparse_single(b)
            patches = self.__repo.diff(t1, t2)
            result = []
            for patch in patches:
                p = {
                    'old_file_path': patch.old_file_path,
                    'new_file_path': patch.new_file_path,
                    'old_oid': str(patch.old_oid),
                    'new_oid': str(patch.new_oid),
                    'status': patch.status,
                    'similarity': patch.similarity,
                    'additions': patch.additions,
                    'deletions': patch.deletions,
                    'binary': patch.is_binary,
                    'hunks': [],
                }
                for hunk in patch.hunks:
                    h = {
                        'old_start': hunk.old_start,
                        'old_lines': hunk.old_lines,
                        'new_start': hunk.new_start,
                        'new_lines': hunk.new_lines,
                        'lines': hunk.lines,
                    }
                    p['hunks'].append(h)
                result.append(p)
            return result
        except Exception as e:
            print(e)
            return None
class GitMiner(BaseMiner):
    Id_Name_Login = namedtuple("Id_Name_Login", ["id", "name", "login"])
    Code_Change = namedtuple("Code_Change", ["commit_id", "filename"])

    def __init__(self, args):
        super().__init__(args)

        self._initialise_db()

        if args.dbms == "sqlite":
            self._conn.execute("PRAGMA foreign_keys=ON")

        self.email_map = {}
        self.commit_id = {}
        self.id_commit = {}
        self.code_change_map = {}

        self.__init_user_emails()

        self._dump_repository()

        self.aio = args.aio
        if self.aio:
            self._create_loop()

        self.repo = Repository(args.path)
        self._fetch_references()
        self._dump_tags()
        self._fetch_commit_ids()

    def _create_loop(self):
        self.loop = asyncio.new_event_loop()

    def load_from_file(self, file):
        pass

    def dump_to_file(self, path):
        pass

    def __init_user_emails(self):
        res = self.execute_query(
            """
            SELECT email, id, login, name
            FROM contributors
            WHERE email IS NOT NULL
            """
        ).fetchall()

        for row in res:
            self.email_map[row[0]] = self.Id_Name_Login(id=row[1], name=row[2],
                                                        login=row[3])

    def __init_code_change(self):
        res = self.execute_query(
            """
            SELECT id, commit_id, filename
            FROM code_change
            """
        ).fetchall()

        for row in res:
            self.code_change_map[self.Code_Change(commit_id=row[1],
                                                  filename=row[2])] = row[0]

    def _dump_repository(self):
        logger.info("Dumping Repository...")

        res = self.execute_query(
            f"""
            SELECT repo_id
            FROM repository
            WHERE name="{self.repo_name}" and owner="{self.repo_owner}"
            """
        ).fetchone()

        if res:
            self._set_repo_id(res[0])
        else:
            repo = RepositoryStruct(
                name=self.repo_name,
                owner=self.repo_owner
            ).process()

            obj = self.db_schema.repository_object(
                name=self.repo_name,
                owner=self.repo_owner,
                created_at=repo.created_at,
                updated_at=repo.updated_at,
                description=repo.description,
                disk_usage=repo.disk_usage,
                fork_count=repo.fork_count,
                url=repo.url,
                homepage_url=repo.homepage_url,
                primary_language=repo.primary_language,
                total_stargazers=repo.stargazer_count,
                total_watchers=repo.watcher_count,
                forked_from=repo.forked_from
            )

            self._insert(self.db_schema.repository.insert(), obj)
            self._set_repo_id()

    def _fetch_references(self):
        self.tags, self.branches = [], {}
        for reference in self.repo.listall_references():
            if 'refs/tags' in reference:
                self.tags.append(reference)
            else:
                self.branches[reference] = self.repo.lookup_reference(
                    reference).peel().oid

    def _dump_tags(self):
        objects = []

        for tag in self.tags:
            ref = self.repo.lookup_reference(tag)
            tag_obj = self.repo[ref.target.hex]

            if isinstance(tag_obj, Tag):
                name = tag_obj.name
                msg = tag_obj.message
                tagged_object = tag_obj.hex
                tagger = self.__get_user_id(name=tag_obj.tagger.name,
                                            email=tag_obj.tagger.email,
                                            oid=tagged_object,
                                            is_author=False, is_tagger=True)
            else:
                name = tag.split('/')[-1]
                msg = tag_obj.message
                tagged_object = tag_obj.hex
                tagger = self.__get_user_id(name=tag_obj.author.name,
                                            email=tag_obj.author.email,
                                            oid=tagged_object,
                                            is_author=True, is_tagger=False)

            obj = self.db_schema.tags_object(
                name=name,
                tagged_object=tagged_object,
                message=msg,
                tagger=tagger
            )

            objects.append(obj)

        self._insert(object_=self.db_schema.tags.insert(), param=objects)

    @staticmethod
    def __get_status(status):
        if status == 1:
            return 'ADDED'
        elif status == 2:
            return 'DELETED'
        elif status == 3:
            return 'MODIFIED'
        elif status == 4:
            return 'RENAMED'
        elif status == 5:
            return 'COPIED'
        elif status == 6:
            return 'IGNORED'
        elif status == 7:
            return 'UNTRACKED'
        elif status == 8:
            return 'TYPECHANGED'
        else:
            return None

    def __init_commits(self, inverse=False):
        if not inverse:
            res = self.execute_query(
                f"""
                SELECT oid, id
                FROM commits
                WHERE repo_id={self.repo_id}
                """
            ).fetchall()

            for row in res:
                self.commit_id[row[0]] = row[1]
        else:
            res = self._conn.execute(
                f"""
                SELECT id, oid
                FROM commits
                WHERE repo_id={self.repo_id}
                """
            ).fetchall()

            for row in res:
                self.id_commit[row[0]] = row[1]

    def __get_commit_id(self, oid, pk=None):
        if not pk:
            try:
                return self.commit_id[oid]
            except KeyError:
                return None
        else:
            try:
                return self.id_commit[pk]
            except KeyError:
                self.__init_commits(inverse=True)
                res = self.__get_commit_id(oid=None, pk=pk)
                if not res:
                    raise Exception(
                        f"GitMiner => __get_commit_id: Pk {pk} does not exist!")
                else:
                    return res

    def __check_user_id(self, email):
        try:
            map_ = self.email_map[email]
            return [map_.id, map_.login, map_.name]
        except KeyError:
            res = self.execute_query(
                f"""
                SELECT id, login, name
                FROM contributors
                WHERE email="{email}"
                """
            ).fetchone()

            if res:
                self.email_map[email] = self.Id_Name_Login(id=res[0],
                                                           login=res[1],
                                                           name=res[2])

            return res

    def __update_contributor(self, name, id_, login, email):
        name = name.replace('"', '""')
        self.execute_query(
            f"""
            UPDATE contributors
            SET name="{name}"
            WHERE id={id_}
            """
        )

        self.email_map[email] = self.Id_Name_Login(id=id_, login=login,
                                                   name=name)

    def __get_user_id(self, name, email, oid, is_author, is_tagger):
        if not email:
            email = None

        if not name:
            name = None

        res = self.__check_user_id(email)

        if not res:
            user = CommitUserStruct(
                oid=oid,
                repo_name=self.repo_name,
                repo_owner=self.repo_owner,
                name=name,
                email=email,
                is_author=is_author,
                is_tagger=is_tagger
            ).process()

            if user is None:
                self._dump_anon_user_object(
                    name=name, email=email,
                    object_=self.db_schema.contributors.insert(),
                    locked_insert=LOCKED)
            else:
                self._dump_user_object(
                    login=None, user_object=user,
                    object_=self.db_schema.contributors.insert(),
                    locked_insert=LOCKED)

            return self.__get_user_id(name=name, email=email, oid=oid,
                                      is_author=is_author, is_tagger=is_tagger)
        else:
            if name == res[2]:
                return res[0]
            elif name == res[1]:
                return res[0]
            else:
                self.__update_contributor(name=name, id_=res[0], login=res[1],
                                          email=email)
                return res[0]

    def _dump_code_change(self, oid):
        commit = self.repo.get(oid)
        commit_id = self.__get_commit_id(oid)

        logger.debug(f"Dumping Code Change for commit_id -> {commit_id}...")
        code_change = []

        if commit:
            if not commit.parents:
                diffs = [self.repo.diff(
                    "4b825dc642cb6eb9a060e54bf8d69288fbee4904", commit)]
            else:
                diffs = [self.repo.diff(i, commit) for i in commit.parents]

            total_diffs = len(diffs)
            for diff in diffs:
                logger.debug(f"Remaining: {total_diffs}")
                total_diffs -= 1
                for patch in diff:
                    obj = self.db_schema.code_change_object(
                        repo_id=self.repo_id,
                        commit_id=commit_id,
                        filename=patch.delta.new_file.path,
                        additions=patch.line_stats[1],
                        deletions=patch.line_stats[2],
                        changes=patch.line_stats[1] + patch.line_stats[2],
                        change_type=self.__get_status(patch.delta.status)
                    )

                    code_change.append(obj)

            self._insert(object_=self.db_schema.code_change.insert(),
                         param=code_change)
            logger.debug(f"Successfully dumped code change for {oid}!")

    def __get_code_change_id(self, commit_id, filename):
        try:
            return self.code_change_map[self.Code_Change(commit_id=commit_id,
                                                         filename=filename)]
        except KeyError:
            # raise (not return) the exception so callers actually see it
            raise Exception(f"GitMiner => __get_code_change_id: Object does "
                            f"not exist! commit_id={commit_id}, "
                            f"filename:{filename}")

    def _dump_patches(self, oid):
        commit = self.repo.get(oid)
        commit_id = self.__get_commit_id(oid)

        logger.debug(f"Dumping Patch for commit_id -> {commit_id}...")
        patches = []

        if not commit.parents:
            diffs = [self.repo.diff(
                "4b825dc642cb6eb9a060e54bf8d69288fbee4904", commit)]
        else:
            diffs = [self.repo.diff(i, commit) for i in commit.parents]

        total_diffs = len(diffs)
        for diff in diffs:
            logger.debug(f"Remaining: {total_diffs}")
            total_diffs -= 1
            for patch in diff:
                obj = self.db_schema.patches_object(
                    code_change_id=self.__get_code_change_id(
                        commit_id, patch.delta.new_file.path),
                    patch=patch.patch
                )

                patches.append(obj)

        self._insert(object_=self.db_schema.patches.insert(), param=patches)
        logger.debug(f"Successfully dumped patch for {oid}!")

    def _dump_commit(self, oid):
        logger.debug(f"Inserting for commit: {oid}...")
        commit = self.repo.get(oid)

        if not commit.parents:
            diffs = [self.repo.diff(
                "4b825dc642cb6eb9a060e54bf8d69288fbee4904", commit)]
        else:
            diffs = [self.repo.diff(i, commit) for i in commit.parents]

        num_files_changed = 0
        additions, deletions = 0, 0
        for diff in diffs:
            num_files_changed += diff.stats.files_changed
            additions += diff.stats.insertions
            deletions += diff.stats.deletions

        author_name = commit.author.name
        author_email = commit.author.email
        author_id = self.__get_user_id(name=author_name, email=author_email,
                                       oid=oid.hex, is_author=True,
                                       is_tagger=False) if \
            author_email.strip() else None
        authored_date = datetime.fromtimestamp(commit.author.time)

        committer_name = commit.committer.name
        committer_email = commit.committer.email
        if committer_email == "*****@*****.**":
            committer_id = author_id
        else:
            committer_id = self.__get_user_id(
                name=committer_name, email=committer_email, oid=oid.hex,
                is_author=False,
                is_tagger=False) if committer_email.strip() else None
        committed_date = datetime.fromtimestamp(commit.commit_time)

        message = commit.message

        if len(commit.parents) > 1:
            is_merge = 1
        else:
            is_merge = 0

        obj = self.db_schema.commits_object(
            repo_id=self.repo_id,
            oid=oid.hex,
            additions=additions,
            deletions=deletions,
            author_id=author_id,
            authored_date=authored_date,
            committer_id=committer_id,
            committer_date=committed_date,
            message=message,
            num_files_changed=num_files_changed,
            is_merge=is_merge
        )

        self._insert(object_=self.db_schema.commits.insert(), param=obj)
        logger.debug(f"Successfully dumped commit: {oid.hex}")

    def __fetch_branch_commits(self, branch_target):
        logger.info(f"Ongoing Branch {branch_target[0]}...")

        for commit in self.repo.walk(branch_target[1],
                                     GIT_SORT_TOPOLOGICAL | GIT_SORT_TIME):
            if commit.oid not in self.commits:
                self.commits.add(commit.oid)
            else:
                break

    def _fetch_commit_ids(self):
        try:
            with open(f"{ROOT}/.gras-cache/{self.repo_name}_commits.txt",
                      "rb") as fp:
                self.commits = pickle.load(fp)

            self.commits = [Oid(hex=x) for x in self.commits]
            logger.info(f"TOTAL COMMITS: {len(self.commits)}")
            return self.commits
        except FileNotFoundError:
            logger.error("Commits file not present, dumping...")

        self.commits = set()
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=THREADS) as executor:
            process = {executor.submit(self.__fetch_branch_commits,
                                       branch_target): branch_target
                       for branch_target in self.branches.items()}

            for future in concurrent.futures.as_completed(process):
                branch_target = process[future]
                logger.info(f"Fetched for {branch_target[0]}, "
                            f"Total: {len(self.commits)}")

        logger.info(f"TOTAL COMMITS: {len(self.commits)}")

        with open(f"{ROOT}/.gras-cache/{self.repo_name}_commits.txt",
                  "wb") as fp:
            temp = [x.hex for x in self.commits]
            pickle.dump(temp, fp)
            del temp

    @timing(name="commits", is_stage=True)
    def _parse_commits(self):
        res = self.execute_query(
            f"""
            SELECT DISTINCT oid
            FROM commits
            """
        ).fetchall()

        dumped_commits = [x[0] for x in res]
        del res

        commits = list(self.commits)
        for i in range(0, len(commits), THREADS):
            proc = [mp.Process(target=self._dump_commit, args=(oid,))
                    for oid in commits[i:i + THREADS]
                    if oid.hex not in dumped_commits]

            for p in proc:
                p.start()

            while any([p.is_alive() for p in proc]):
                continue

    @timing(name="code change", is_stage=True)
    def _parse_code_change(self):
        id_oid = self.execute_query(
            f"""
            SELECT id, oid
            FROM commits
            """
        ).fetchall()

        dumped_ids = self.execute_query(
            f"""
            SELECT DISTINCT commit_id
            FROM code_change
            """
        ).fetchall()

        dumped_ids = [x[0] for x in dumped_ids]

        not_dumped_commits = [x[1] for x in id_oid if x[0] not in dumped_ids]
        del dumped_ids
        del id_oid

        for i in range(0, len(not_dumped_commits), THREADS):
            proc = [mp.Process(target=self._dump_code_change, args=(oid,))
                    for oid in not_dumped_commits[i: i + THREADS]]

            for p in proc:
                p.start()

            while any([x.is_alive() for x in proc]):
                continue

    @timing(name="patches", is_stage=True)
    def _parse_patches(self):
        self.__init_commits(inverse=True)

        res = self.execute_query(
            f"""
            SELECT id, commit_id
            FROM code_change
            """
        ).fetchall()

        cc_commit = {}
        for row in res:
            cc_commit[row[0]] = row[1]

        res = self.execute_query(
            """
            SELECT code_change_id
            FROM patches
            """
        )

        not_dumped_commits = set(cc_commit.values()).difference(
            {cc_commit[x[0]] for x in res})
        not_dumped_commits = sorted(
            [self.id_commit[id_] for id_ in not_dumped_commits])
        del cc_commit

        for i in range(0, len(not_dumped_commits), THREADS):
            # this stage dumps patches, so spawn _dump_patches here
            proc = [mp.Process(target=self._dump_patches, args=(oid,))
                    for oid in not_dumped_commits[i: i + THREADS]]

            for p in proc:
                p.start()

            while any([x.is_alive() for x in proc]):
                continue

    @timing(name="async -> commits", is_stage=True)
    async def _async_parse_commits(self):
        loop = asyncio.get_event_loop()
        tasks = [loop.run_in_executor(self.executor, self._dump_commit, oid)
                 for oid in self.commits]
        completed, _ = await asyncio.wait(tasks)
        for t in completed:
            logger.info(f"Dumped commit: {t.result()}")

    @timing(name="async -> code change", is_stage=True)
    async def _async_parse_code_change(self):
        loop = asyncio.get_event_loop()
        tasks = [loop.run_in_executor(self.executor, self._dump_code_change,
                                      oid)
                 for oid in self.commits]
        completed, _ = await asyncio.wait(tasks)
        for t in completed:
            logger.info(f"Dumped Code Change for commit: {t.result()}")

    def process(self):
        if self.aio:
            # run_until_complete needs the coroutine variants of the stages
            self.loop.run_until_complete(self._async_parse_commits())
            self.loop.run_until_complete(self._async_parse_code_change())
        else:
            # self._parse_commits()
            self.__init_commits()
            self._parse_code_change()
            # self._parse_patches()

    def __del__(self):
        if self.aio:
            self.loop.close()
class GitStorage(BaseStorage):

    _backend = None

    def __init__(self, context, repo_path=None):
        self.context = context
        rp = IStorageInfo(context).path

        try:
            self.repo = Repository(discover_repository(rp))
        except KeyError:
            # discover_repository may have failed.
            raise PathNotFoundError('repository does not exist at path')

        self.checkout()  # defaults to HEAD.

    @property
    def empty_root(self):
        return {'': '_empty_root'}

    def _get_empty_root(self):
        return self.empty_root

    def _get_obj(self, path, cls=None):
        if path == '' and self._commit is None:
            # special case
            return self._get_empty_root()

        if self._commit is None:
            raise PathNotFoundError('repository is empty')

        root = self._commit.tree

        try:
            breadcrumbs = []
            fragments = list(reversed(path.split('/')))
            node = root
            oid = None
            while fragments:
                fragment = fragments.pop()
                if not fragment == '':
                    # no empty string entries, also skips over '//' and
                    # leaves the final node (if directory) as the tree.
                    oid = node[fragment].oid
                    node = self.repo.get(oid)
                    breadcrumbs.append(fragment)
                    if node is None:
                        # strange. Looks like it's either submodules only
                        # have entry nodes or pygit2 doesn't fully support
                        # this. Try to manually resolve the .gitmodules
                        # file.
                        if cls is None:
                            # Only return this if a specific type was not
                            # expected.
                            submods = parse_gitmodules(self.repo.get(
                                root[GIT_MODULE_FILE].oid).data)
                            submod = submods.get('/'.join(breadcrumbs))
                            if submod:
                                fragments.reverse()
                                return {
                                    '': '_subrepo',
                                    'location': submod,
                                    'path': '/'.join(fragments),
                                    'rev': oid.hex,
                                }

            if node and (cls is None or isinstance(node, cls)):
                return node

        except KeyError:
            # can't find what is needed in repo, raised by pygit2
            raise PathNotFoundError('path not found')

        # not what we were looking for.
        if cls == Tree:
            raise PathNotDirError('path not dir')
        elif cls == Blob:
            raise PathNotFileError('path not file')
        raise PathNotFoundError('path not found')

    @property
    def _commit(self):
        return self.__commit

    @property
    def rev(self):
        if self.__commit:
            return self.__commit.hex
        return None

    @property
    def shortrev(self):
        # TODO this is an interim solution.
        if self.rev:
            return self.rev[:12]

    def basename(self, name):
        return name.split('/')[-1]

    def checkout(self, rev=None):
        # None maps to the default revision.
        if rev is None:
            rev = 'HEAD'

        try:
            self.__commit = self.repo.revparse_single(rev)
        except KeyError:
            if rev == 'HEAD':
                # probably a new repo.
                self.__commit = None
                return
            raise RevisionNotFoundError('revision %s not found' % rev)
            # otherwise a RevisionNotFoundError should be raised.

    def files(self):
        def _files(tree, current_path=None):
            results = []
            for node in tree:
                if current_path:
                    name = '/'.join([current_path, node.name])
                else:
                    name = node.name

                obj = self.repo.get(node.oid)
                if isinstance(obj, Blob):
                    results.append(name)
                elif isinstance(obj, Tree):
                    results.extend(_files(obj, name))
            return results

        if not self._commit:
            return []
        results = _files(self._commit.tree)
        return results

    def file(self, path):
        return self._get_obj(path, Blob).data

    def listdir(self, path):
        if path:
            tree = self._get_obj(path, Tree)
        else:
            if self._commit is None:
                return []
            tree = self._commit.tree
        return [entry.name for entry in tree]

    def format(self, **kw):
        # XXX backwards compatibility??
        return kw

    def log(self, start, count, branch=None, shortlog=False):
        """
        start and branch are literally the same thing.
        """

        def _log(iterator):
            for pos, commit in iterator:
                if pos == count:
                    # raising StopIteration inside a generator is an error
                    # under PEP 479; return ends the generator cleanly
                    return
                yield {
                    'author': commit.committer.name,
                    'email': commit.committer.email,
                    'date': self.strftime(committer_dt(commit.committer)),
                    'node': commit.hex,
                    'rev': commit.hex,
                    'desc': commit.message
                }

        if start is None:
            # assumption.
            start = 'HEAD'

        try:
            self.repo.revparse_single(start)
        except KeyError:
            return []

        try:
            rev = self.repo.revparse_single(start).hex
        except KeyError:
            raise RevisionNotFoundError('revision %s not found' % start)

        iterator = enumerate(self.repo.walk(rev, GIT_SORT_TIME))
        return list(_log(iterator))

    def pathinfo(self, path):
        obj = self._get_obj(path)
        if isinstance(obj, Blob):
            return self.format(**{
                'type': 'file',
                'basename': self.basename(path),
                'size': obj.size,
                'date': self.strftime(committer_dt(self._commit.committer)),
            })
        elif isinstance(obj, dict):
            # special cases are represented as dict.
            if obj[''] == '_subrepo':
                return self.format(**{
                    'type': 'subrepo',
                    'date': '',
                    'size': 0,
                    'basename': self.basename(path),
                    # extra field.
                    'obj': obj,
                })
            elif obj[''] == '_empty_root':
                return self.format(**{
                    'type': 'folder',
                    'date': '',
                    'size': 0,
                    'basename': self.basename(path),
                })
        # Assume this is a Tree.
        return self.format(**{
            'basename': self.basename(path),
            'size': 0,
            'type': 'folder',
            'date': '',
        })

    def branches(self):
        return tuple(
            (b, self.repo.lookup_branch(b).target.hex)
            for b in self.repo.listall_branches()
        )

    def tags(self):
        return tuple(
            (b[10:], self.repo.lookup_reference(b).target.hex)
            for b in self.repo.listall_references()
            if b.startswith('refs/tags')
        )
class GitBlack:
    def __init__(self):
        self.repo = Repository(".")
        self.patchers = {}

    def get_blamed_deltas(self, patch):
        filename = patch.delta.old_file.path
        self.patchers[filename] = Patcher(self.repo, filename)
        hb = HunkBlamer(self.repo, patch)
        return hb.blames()

    def group_blame_deltas(self, blames):
        for delta_blame in blames:
            commits = tuple(sorted(delta_blame.commits))
            self.grouped_deltas.setdefault(commits, []).append(
                delta_blame.delta)

        self.progress += 1
        now = time.monotonic()
        if now - self.last_log > 0.04:
            sys.stdout.write("Reading file {}/{} \r".format(
                self.progress, self.total))
            sys.stdout.flush()
            self.last_log = now

    def commit_changes(self):
        start = time.monotonic()
        self.grouped_deltas = {}

        for path, status in self.repo.status().items():
            if status & index_statuses:
                raise GitIndexNotEmpty

        patches = []
        self._file_modes = {}
        diff = self.repo.diff(context_lines=0,
                              flags=GIT_DIFF_IGNORE_SUBMODULES)
        for patch in diff:
            if patch.delta.status != GIT_DELTA_MODIFIED:
                continue
            self._file_modes[
                patch.delta.old_file.path] = patch.delta.old_file.mode
            patches.append(patch)

        self.progress = 0
        self.last_log = 0
        self.total = len(patches)
        executor = ThreadPoolExecutor(max_workers=8)
        tasks = set()
        for patch in patches:
            tasks.add(executor.submit(self.get_blamed_deltas, patch))
            if len(tasks) > 8:
                done, not_done = wait(tasks, return_when=FIRST_COMPLETED)
                for task in done:
                    self.group_blame_deltas(task.result())
                tasks -= set(done)

        for task in tasks:
            self.group_blame_deltas(task.result())

        secs = time.monotonic() - start
        sys.stdout.write("Reading file {}/{} ({:.2f} secs).\n".format(
            self.progress, self.total, secs))

        start = time.monotonic()
        self.total = len(self.grouped_deltas)
        self.progress = 0
        self.last_log = 0
        for commits, deltas in self.grouped_deltas.items():
            blobs = self._create_blobs(deltas)
            self._commit(commits, blobs)

        secs = time.monotonic() - start
        print("Making commit {}/{} ({:.2f} secs).".format(
            self.progress, self.total, secs))

    def _create_blobs(self, deltas):
        filenames = set()
        for delta in deltas:
            self.patchers[delta.filename].apply(delta)
            filenames.add(delta.filename)

        blobs = {}
        for filename in filenames:
            blob_id = self.repo.create_blob(self.patchers[filename].content())
            blobs[filename] = blob_id
        return blobs

    def _commit(self, original_commits, blobs):
        for filename, blob_id in blobs.items():
            file_mode = self._file_modes[filename]
            index_entry = IndexEntry(filename, blob_id, file_mode)
            self.repo.index.add(index_entry)

        commits = [self.repo.get(h) for h in original_commits]

        main_commit = commits[0]
        if len(commits) > 1:
            # most recent commit
            main_commit = sorted(commits, key=commit_datetime)[-1]

        commit_message = main_commit.message
        commit_message += "\n\nautomatic commit by git-black, original commits:\n"
        commit_message += "\n".join(
            ["  {}".format(c) for c in original_commits])

        committer = Signature(
            name=self.repo.config["user.name"],
            email=self.repo.config["user.email"],
        )

        self.repo.index.write()
        tree = self.repo.index.write_tree()
        head = self.repo.head.peel()
        self.repo.create_commit("HEAD", main_commit.author, committer,
                                commit_message, tree, [head.id])

        self.progress += 1
        now = time.monotonic()
        if now - self.last_log > 0.04:
            sys.stdout.write("Making commit {}/{} \r".format(
                self.progress, self.total))
            sys.stdout.flush()
            self.last_log = now
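# Hedged usage sketch for GitBlack: run from a repository working directory
# with reformatted-but-uncommitted files. Patcher and HunkBlamer are
# project-local helpers not shown here.
gb = GitBlack()
gb.commit_changes()  # groups hunks by their blamed commits, one commit per group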
class CollectGit(object): """ Small Helper class for small repositories. This does not scale because we hold a lot of data in memory. """ _regex_comment = re.compile( r"(//[^\"\n\r]*(?:\"[^\"\n\r]*\"[^\"\n\r]*)*[\r\n]|/\*([^*]|\*(?!/))*?\*/)(?=[^\"]*(?:\"[^\"]*\"[^\"]*)*$)" ) _regex_jdoc_line = re.compile(r"(- |\+)\s*(\*|/\*).*") def __init__(self, path): if not path.endswith('.git'): if not path.endswith('/'): path += '/' path += '.git' self._log = logging.getLogger(self.__class__.__name__) self._path = path self._repo = Repository(self._path) self._hunks = {} self._file_actions = {} self._bugfix = {} self._msgs = {} self._days = {} self._cdays = {} self._branches = {} self._tags = {} self._dopts = GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_COPIES self._SIMILARITY_THRESHOLD = 50 self._graph = nx.DiGraph() @classmethod def clone_repo(cls, uri, local_path): project_name = uri.split('/')[-1].split('.git')[0] repo_path = local_path + '/' + project_name + '/' if os.path.isdir(repo_path): c = subprocess.run(['git', 'fetch'], cwd=repo_path, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if c.returncode != 0: err = 'Error pulling repository {} to {}'.format( uri, repo_path) raise Exception(err) else: os.mkdir(repo_path) c = subprocess.run(['git', 'clone', uri, repo_path], cwd=repo_path, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if c.returncode != 0: err = 'Error cloning repository {} to {}'.format( uri, repo_path) raise Exception(err) return repo_path def _changed_lines(self, hunk): added_lines = [] deleted_lines = [] del_line = hunk['old_start'] add_line = hunk['new_start'] for line in hunk['content'].split('\n'): tmp = line[1:].strip() # is_comment = tmp.startswith('//') or tmp.startswith('/*') or tmp.startswith('*') if line.startswith('+'): added_lines.append((add_line, tmp)) del_line -= 1 if line.startswith('-'): deleted_lines.append((del_line, tmp)) add_line -= 1 del_line += 1 add_line += 1 return added_lines, deleted_lines def _comment_only_change(self, content): content = content + '\n' # required for regex to drop comments content = re.sub(self._regex_comment, "", content) removed = '' added = '' for line in content.split('\n'): line = re.sub( r"\s+", " ", line, flags=re.UNICODE ) # replace all kinds of whitespaces (also multiple) with sińgle whitespace if not re.match(self._regex_jdoc_line, line): if line.startswith('-'): removed += line[1:].strip() elif line.startswith('+'): added += line[1:].strip() return removed == added def _blame_lines(self, revision_hash, filepath, strategy, ignore_lines=False, validated_bugfix_lines=False): """We want to find changed lines for one file in one commit (from the previous commit). For this we are iterating over the diff and counting the lines that are deleted (changed) from the original file. We ignore all added lines. ignore_lines is already specific to all changed hunks of the file for which blame_lines is called """ c = self._repo.revparse_single('{}'.format(revision_hash)) self._hunks[revision_hash] = self._get_hunks(c) changed_lines = [] if revision_hash not in self._hunks.keys( ) or not self._hunks[revision_hash]: return changed_lines for h in self._hunks[revision_hash]: if h['new_file'] != filepath: continue # only whitespace or comment changes in the hunk, ignore if strategy == 'code_only' and self._comment_only_change( h['content']): self._log.debug( 'detected whitepace or comment only change in {} for {}'. 
format(revision_hash, filepath)) continue added, deleted = self._changed_lines(h) for dt in deleted: if dt not in changed_lines and dt[1]: if strategy == 'code_only' and dt[1].startswith( ('//', '/*', '*')): continue # we may only want validated lines if validated_bugfix_lines is not False: if dt[0] not in validated_bugfix_lines: continue # we may ignore lines, e.g., refactorings if ignore_lines: ignore = False for start_line, end_line in ignore_lines: if start_line <= dt[0] <= end_line: ignore = True break # if we hit the line in our ignore list we continue to the next if ignore: # self._log.warn('ignore line {} in file {} in commit {} because of refactoring detection'.format(dt[0], filepath, revision_hash)) continue changed_lines.append(dt) return changed_lines def blame(self, revision_hash, filepath, strategy='code_only', ignore_lines=False, validated_bugfix_lines=False): """Collect a list of commits where the given revision and file were last changed. Uses git blame. :param str revision_hash: Commit for which we want to collect blame commits. :param str filepath: File for which we want to collect blame commits. :rtype: list :returns: A list of tuples of blame commits and the original file for the given parameters. """ commits = [] # - ignore if commit is not in graph if revision_hash not in self._graph: return [] # # - ignore package-info.java # if strategy == 'code_only' and filepath.lower().endswith('package-info.java'): # self._log.debug('skipping blame on revision: {} for file {} because it is package-info.java'.format(revision_hash, filepath)) # return [] # # - ignore test/ /test/ example/ examples/ # if strategy == 'code_only' and re.match(self._regex_test_example, filepath): # self._log.debug('skipping blame on revision: {} for file {} because it is a test or an example'.format(revision_hash, filepath)) # return [] # bail on multiple parents parents = list(self._graph.predecessors(revision_hash)) if len(parents) > 1: self._log.debug( 'skipping blame on revision: {} because it is a merge commit'. 
format(revision_hash)) return [] changed_lines = self._blame_lines(revision_hash, filepath, strategy, ignore_lines, validated_bugfix_lines) parent_commit = self._repo.revparse_single('{}^'.format(revision_hash)) blame = self._repo.blame(filepath, flags=GIT_BLAME_TRACK_COPIES_SAME_FILE, newest_commit=parent_commit.hex) for lineno, line in changed_lines: # returns blamehunk for specific line try: bh = blame.for_line(lineno) except IndexError as e: # this happens when we have the wrong parent node bla = 'tried to get file: {}, line: {}, revision: {}, blame commit: {}'.format( filepath, lineno, revision_hash, str(bh.orig_commit_id)) self._log.error(bla) raise # this is critical inducing_commit = self._repo.revparse_single(str( bh.orig_commit_id)) # start = bh.orig_start_line_number # lines = bh.lines_in_hunk # final_start = bh.final_start_line_number # print(revision_hash, '->', inducing_commit.hex) # print('original: {}: {}'.format(lineno, line)) # print('{},{}: {},{}'.format(start, lines, final_start, lines)) # blame_lines = [] # for hunk in self._hunks[inducing_commit.hex]: # if hunk['new_file'] != bh.orig_path: # continue # ls = final_start # for i, blame_line in enumerate(hunk['content'].split('\n')): # if blame_line[1:].strip() and line[1:].strip() and blame_line[1:] == line[1:]: # print('blame: {}:{}'.format(ls, blame_line)) # ls += 1 commits.append((inducing_commit.hex, bh.orig_path)) # make unique return list(set(commits)) def commit_information(self, revision_hash): obj = self._repo.get(revision_hash) return { 'author_name': obj.author.name, 'author_email': obj.author.email, 'committer_name': obj.committer.name, 'committer_email': obj.committer.email, 'committer_date_utc': datetime.fromtimestamp(obj.commit_time, tz=timezone.utc), 'committer_date': obj.commit_time, 'committer_date_offset': obj.commit_time_offset, 'message': obj.message, 'file_actions': self._file_actions[revision_hash] } def file_actions(self, revision_hash): return self._file_actions[revision_hash] def all_files(self, revision_hash): # 1. checkout repo self._checkout_revision(revision_hash) # 2. 
    def first_occurence(self, filename):
        # file rename tracking is currently not possible in libgit, see:
        # https://github.com/libgit2/libgit2/issues/3041
        # find the first occurrence of the file with the git cli instead:
        # git log --follow --diff-filter=A --find-renames=40% foo.js
        path = self._path.replace('.git', '')
        c = subprocess.run([
            'git', 'log', '--all', '--pretty=tformat:"%H %ci"', '--follow',
            '--diff-filter=A', '--find-renames=80%', '--', filename
        ],
                           cwd=path,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE)

        if c.returncode != 0:
            err = 'Error finding first occurrence of file: {}'.format(filename)
            self._log.error(err)
            self._log.error(c.stderr)
            raise Exception(err)

        full = c.stdout.decode('utf-8')
        try:
            first_line = full.split('\n')[-2]
        except IndexError:
            if not full:
                print('no git log for file {}'.format(filename))
            print(full)
            raise

        first_date = ' '.join(first_line.split(' ')[1:]).replace('"', '')
        # we can use a fixed format here because we control the input format;
        # %z does not cover +01:00, only +0100 (at least in Python 3.6)
        dt = datetime.strptime(first_date, '%Y-%m-%d %H:%M:%S %z')
        return dt

    def tags(self):
        regex = re.compile('^refs/tags')
        ret = []
        for tagref in filter(lambda r: regex.match(r),
                             self._repo.listall_references()):
            tag = self._repo.lookup_reference(tagref)
            target = tag.peel()
            ret.append({
                'name': tag.name.replace('refs/tags/', ''),
                'revision_hash': target.id
            })
        return ret

    def _checkout_revision(self, revision):
        """Checkout a revision via the shell; stdout output is ignored."""
        path = self._path.replace('.git', '')
        c = subprocess.run(['git', 'checkout', '-q', '-f', revision],
                           cwd=path,
                           stdout=subprocess.PIPE)
        return c.returncode == 0

    def _list_files(self):
        """The slower list_files, walks the working directory."""
        path = self._path.replace('.git', '')
        ret = []
        for root, dirs, files in os.walk(path):
            for file in files:
                filepath = os.path.join(root, file)
                relative_filepath = filepath.replace(path, '')
                ret.append(relative_filepath)
        return ret

    def _list_files2(self):
        """The faster list_files (relies on the find command)."""
        path = self._path.replace('.git', '')
        lines = subprocess.check_output(['find', '.', '-iname', '*.java'],
                                        cwd=path)
        files = []
        for f in lines.decode('utf-8').split('\n'):
            if f.lower().endswith('.java'):
                files.append(f.replace('./', ''))
        return files
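    # A hedged sketch, not part of the original class: instead of walking the
    # working directory or shelling out to find, the files of a revision can
    # be read straight from the commit's tree object. The method name is an
    # assumption for illustration; pygit2 is imported locally to keep it
    # self-contained.
    def _list_files_from_tree(self, revision_hash):
        """Collect all blob paths of the given revision from its tree."""
        import pygit2
        files = []

        def walk_tree(tree, prefix=''):
            for entry in tree:
                obj = self._repo[entry.id]
                if isinstance(obj, pygit2.Tree):
                    walk_tree(obj, prefix + entry.name + '/')
                else:
                    files.append(prefix + entry.name)

        walk_tree(self._repo.revparse_single(revision_hash).tree)
        return files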
    def _get_hunks(self, commit):
        diffs = []
        hunks = []

        # for the initial commit (or orphan commits) pygit2 needs special attention
        initial = False
        if not commit.parents:
            initial = True
            diffs.append((None,
                          commit.tree.diff_to_tree(context_lines=0,
                                                   interhunk_lines=1)))

        # we may have multiple parents (merge commit); we need the information
        # from each parent because in a merge each parent may add different files
        for parent in commit.parents:
            tmp = self._repo.diff(parent, commit, context_lines=0,
                                  interhunk_lines=1)
            tmp.find_similar(self._dopts, self._SIMILARITY_THRESHOLD,
                             self._SIMILARITY_THRESHOLD)
            diffs.append((parent.hex, tmp))

        # the delta status bookkeeping that _changed_files() performs is not
        # needed here; only the hunks themselves are collected
        for parent, diff in diffs:
            for patch in diff:
                # we may have hunks to add
                if patch.hunks and commit.hex not in self._hunks:
                    self._hunks[commit.hex] = []

                # add hunks
                for hunk in patch.hunks:
                    # the initial commit is a special case: diff_to_tree
                    # yields a reversed diff, so old and new are swapped
                    if initial:
                        content = ''.join(['+' + l.content for l in hunk.lines])
                        hunks.append({
                            'header': hunk.header,
                            'new_file': patch.delta.new_file.path,
                            'new_start': hunk.old_start,
                            'new_lines': hunk.old_lines,
                            'old_start': hunk.new_start,
                            'old_lines': hunk.new_lines,
                            'content': content
                        })
                    else:
                        content = ''.join(
                            [l.origin + l.content for l in hunk.lines])
                        hunks.append({
                            'header': hunk.header,
                            'new_file': patch.delta.new_file.path,
                            'new_start': hunk.new_start,
                            'new_lines': hunk.new_lines,
                            'old_start': hunk.old_start,
                            'old_lines': hunk.old_lines,
                            'content': content
                        })
        return hunks

    def _changed_files(self, commit):
        changed_files = []
        diffs = []

        # for the initial commit (or orphan commits) pygit2 needs special attention
        initial = False
        if not commit.parents:
            initial = True
            diffs.append((None,
                          commit.tree.diff_to_tree(context_lines=0,
                                                   interhunk_lines=1)))

        # we may have multiple parents (merge commit); we need the information
        # from each parent because in a merge each parent may add different files
        for parent in commit.parents:
            tmp = self._repo.diff(parent, commit, context_lines=0,
                                  interhunk_lines=1)
            tmp.find_similar(self._dopts, self._SIMILARITY_THRESHOLD,
                             self._SIMILARITY_THRESHOLD)
            diffs.append((parent.hex, tmp))

        for parent, diff in diffs:
            checked_paths = set()
            for patch in diff:
                if patch.delta.new_file.path in checked_paths:
                    self._log.warning(
                        'already have {} in checked_paths'.format(
                            patch.delta.new_file.path))
                    continue

                # map the pygit2 delta status to a one-letter file action
                status_modes = {1: 'A', 2: 'D', 3: 'M', 4: 'R',
                                5: 'C', 6: 'I', 7: 'U', 8: 'T'}
                mode = status_modes.get(patch.delta.status, 'X')

                # diff_to_tree yields D for the initial commit otherwise
                if initial:
                    mode = 'A'

                # we may have hunks to add
                if patch.hunks and commit.hex not in self._hunks:
                    self._hunks[commit.hex] = []

                # add hunks
                for hunk in patch.hunks:
                    # the initial commit is a special case: diff_to_tree
                    # yields a reversed diff, so old and new are swapped
                    if initial:
                        content = ''.join(['+' + l.content for l in hunk.lines])
                        self._hunks[commit.hex].append({
                            'header': hunk.header,
                            'new_file': patch.delta.new_file.path,
                            'new_start': hunk.old_start,
                            'new_lines': hunk.old_lines,
                            'old_start': hunk.new_start,
                            'old_lines': hunk.new_lines,
                            'content': content
                        })
                    else:
                        content = ''.join(
                            [l.origin + l.content for l in hunk.lines])
                        self._hunks[commit.hex].append({
                            'header': hunk.header,
                            'new_file': patch.delta.new_file.path,
                            'new_start': hunk.new_start,
                            'new_lines': hunk.new_lines,
                            'old_start': hunk.old_start,
                            'old_lines': hunk.old_lines,
                            'content': content
                        })

                # collect line stats; diff_to_tree also swaps added/deleted
                if initial:
                    fa = {
                        'lines_added': patch.line_stats[2],
                        'lines_deleted': patch.line_stats[1],
                        'changeset_size': len(diff),
                        'parent': None
                    }
                else:
                    fa = {
                        'lines_added': patch.line_stats[1],
                        'lines_deleted': patch.line_stats[2],
                        'changeset_size': len(diff),
                        'parent': parent
                    }

                # copies and renames carry the old path as well
                if mode in ['C', 'R']:
                    changed_file = [mode, patch.delta.new_file.path,
                                    patch.delta.old_file.path, fa]
                else:
                    changed_file = [mode, patch.delta.new_file.path, None, fa]

                checked_paths.add(patch.delta.new_file.path)
                changed_files.append(changed_file)
        return changed_files
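    # A hedged sketch, not part of the original class: pygit2 can produce the
    # one-letter action codes used in _changed_files() directly, since
    # DiffDelta.status_char() returns 'A', 'D', 'M', 'R', ... for the same
    # status values mapped manually above. Illustrative only.
    def _changed_paths(self, commit):
        """Return (action, path) pairs for every patch of a non-merge commit."""
        actions = []
        for parent in commit.parents:
            diff = self._repo.diff(parent, commit, context_lines=0,
                                   interhunk_lines=1)
            diff.find_similar()  # without this, renames/copies show up as A/D
            for patch in diff:
                actions.append((patch.delta.status_char(),
                                patch.delta.new_file.path))
        return actions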
    def collect(self):
        # walk all branches
        for branch in list(self._repo.branches):
            self._collect_branch(branch)

        # walk all tags
        for obj in self._repo:
            tag = self._repo[obj]
            if tag.type == GIT_OBJ_TAG:
                self._collect_branch(tag, is_tag=True)

        return self._graph

    def _collect_branch(self, branch, is_tag=False):
        if isinstance(branch, str):
            branch = self._repo.branches[branch]

        try:
            # add nodes to the graph
            for c in self._repo.walk(branch.target):
                self._graph.add_node(c.hex)

                # further per-node bookkeeping is disabled here: branch/tag
                # membership (used for traversing backwards for tags in
                # svn->git conversions), commit messages, commit days for
                # later lookup, file actions per node, and hunk creation
                # (still too expensive)

            # add edges to the graph
            for c in self._repo.walk(branch.target):
                for p in c.parents:
                    self._graph.add_edge(p.hex, c.hex)
        except ValueError:
            # ignore branches whose target cannot be walked
            pass
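# A hedged, standalone sketch of the graph construction that collect()
# performs above: every commit reachable from a branch becomes a node and
# every parent->child relation an edge. Assumes networkx and pygit2 are
# available; the repository path is a placeholder.
import networkx as nx
import pygit2


def build_commit_graph(repo_path):
    repo = pygit2.Repository(repo_path)
    graph = nx.DiGraph()
    for name in repo.branches:
        try:
            branch = repo.branches[name]
            for commit in repo.walk(branch.target):
                graph.add_node(str(commit.id))
                for parent in commit.parents:
                    graph.add_edge(str(parent.id), str(commit.id))
        except (ValueError, KeyError):
            continue  # skip branches whose target cannot be resolved
    return graph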
def save_experience_features_graph(repo_path, branch, graph_path):
    """Get and save the experience graph."""
    repo = Repository(repo_path)
    head = repo.references.get(branch)

    commits = list(
        repo.walk(head.target, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE))

    start_time = time.time()

    current_commit = repo.get(str(repo.head.target))
    files = get_files_in_tree(current_commit.tree, repo)

    all_authors = {}
    author = current_commit.committer.name
    all_authors[author] = {}
    all_authors[author]['lastcommit'] = current_commit.hex
    all_authors[author][current_commit.hex] = {}
    all_authors[author][current_commit.hex]['prevcommit'] = ""
    all_authors[author][current_commit.hex]['exp'] = 1
    all_authors[author][current_commit.hex]['rexp'] = [[len(files), 1]]
    all_authors[author][current_commit.hex]['sexp'] = {}

    for i, commit in enumerate(tqdm(commits[1:])):
        # commits[i] is the predecessor of commit because we enumerate commits[1:]
        files = get_diffing_files(commit, commits[i], repo)

        author = commit.committer.name
        if author not in all_authors:
            all_authors[author] = {}
            all_authors[author]['lastcommit'] = commit.hex
            all_authors[author][commit.hex] = {}
            all_authors[author][commit.hex]['prevcommit'] = ""
            all_authors[author][commit.hex]['exp'] = 1
            all_authors[author][commit.hex]['rexp'] = [[len(files), 1.0]]
            all_authors[author][commit.hex]['sexp'] = {}
        else:
            last_commit = all_authors[author]['lastcommit']
            all_authors[author]['lastcommit'] = commit.hex
            all_authors[author][commit.hex] = {}
            all_authors[author][commit.hex]['prevcommit'] = last_commit
            all_authors[author][commit.hex]['exp'] = \
                1 + all_authors[author][last_commit]['exp']

            date_current = datetime.fromtimestamp(commit.commit_time)
            date_last = datetime.fromtimestamp(
                repo.get(last_commit).commit_time)
            diffing_years = abs(
                floor(float((date_current - date_last).days) / 365))

            overall = all_authors[author][last_commit]['rexp']
            all_authors[author][commit.hex]['rexp'] = [
                [len(files), 1.0]
            ] + [[e[0], e[1] + diffing_years] for e in overall]

    with open(graph_path, 'w') as output:
        json.dump(all_authors, output, default=set_to_list)

    end_time = time.time()
    print("Done")
    print("Overall processing time {}".format(end_time - start_time))
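# The helpers get_files_in_tree() and get_diffing_files() are used above but
# not shown in this corpus. The following is a hedged reconstruction from
# their call sites; the names come from the code above, but the bodies,
# signatures and return types are assumptions.
import pygit2


def get_files_in_tree(tree, repo):
    """Recursively collect the paths of all blobs in `tree` (assumed)."""
    files = set()
    for entry in tree:
        obj = repo[entry.id]
        if isinstance(obj, pygit2.Tree):
            files.update('{}/{}'.format(entry.name, name)
                         for name in get_files_in_tree(obj, repo))
        else:
            files.add(entry.name)
    return files


def get_diffing_files(commit, parent, repo):
    """Return the file paths touched between `parent` and `commit` (assumed)."""
    diff = repo.diff(parent, commit)
    return {patch.delta.new_file.path for patch in diff}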
class GitRepo:
    """A class that manages a git repository.

    This class enables versioning via git for a repository.
    You can stage and commit files and checkout different commits of the repository.
    """

    path = ''
    pathspec = []
    repo = None
    callback = None
    author_name = 'QuitStore'
    author_email = '*****@*****.**'
    gcProcess = None

    def __init__(self, path, origin=None, gc=False):
        """Initialize a new repository from an existing directory.

        Args:
            path: A string containing the path to the repository.
            origin: The remote URL to clone from, fetch from and push to.
            gc: If True, spawn git garbage collection after commits.
        """
        logger = logging.getLogger('quit.core.GitRepo')
        logger.debug('GitRepo, init, Create an instance of GitStore')
        self.path = path
        self.gc = gc

        if not exists(path):
            try:
                makedirs(path)
            except OSError as e:
                raise Exception('Can\'t create path in filesystem:', path, e)

        try:
            self.repo = Repository(path)
        except (KeyError, AttributeError):
            pass

        if origin:
            self.callback = QuitRemoteCallbacks()

        if self.repo:
            if self.repo.is_bare:
                raise QuitGitRepoError('Bare repositories not supported, yet')

            if origin:
                # set remote
                self.addRemote('origin', origin)
        else:
            if origin:
                # clone
                self.repo = self.cloneRepository(origin, path, self.callback)
            else:
                self.repo = init_repository(path=path, bare=False)

    def cloneRepository(self, origin, path, callback):
        try:
            repo = clone_repository(url=origin,
                                    path=path,
                                    bare=False,
                                    callbacks=callback)
            return repo
        except Exception as e:
            raise QuitGitRepoError(
                "Could not clone from: {} origin. {}".format(origin, e))

    def addall(self):
        """Add all (newly created|changed) files to the index."""
        self.repo.index.read()
        self.repo.index.add_all(self.pathspec)
        self.repo.index.write()

    def addfile(self, filename):
        """Add a file to the index.

        Args:
            filename: A string containing the path to the file.
        """
        index = self.repo.index
        index.read()

        try:
            index.add(filename)
            index.write()
        except Exception as e:
            logger.info(
                "GitRepo, addfile, Could not add file {}.".format(filename))
            logger.debug(e)

    def addRemote(self, name, url):
        """Add a remote.

        Args:
            name: A string containing the name of the remote.
            url: A string containing the url to the remote.
        """
        try:
            self.repo.remotes.create(name, url)
            logger.info("Successfully added remote: {} - {}".format(name, url))
        except Exception as e:
            logger.info("Could not add remote: {} - {}".format(name, url))
            logger.debug(e)

        try:
            self.repo.remotes.set_push_url(name, url)
            self.repo.remotes.set_url(name, url)
        except Exception as e:
            logger.info("Could not set push/fetch urls: {} - {}".format(
                name, url))
            logger.debug(e)

    def checkout(self, commitid):
        """Checkout a commit by its commit id.

        Args:
            commitid: A string containing a commit id.
        """
        try:
            commit = self.repo.revparse_single(commitid)
            self.repo.set_head(commit.oid)
            self.repo.reset(commit.oid, GIT_RESET_HARD)
            logger.info("Checked out commit: {}".format(commitid))
        except Exception as e:
            logger.info("Could not check out commit: {}".format(commitid))
            logger.debug(e)

    def commit(self, message=None):
        """Commit staged files.

        Args:
            message: A string for the commit message.

        Returns silently if the staging area is clean.
        """
        if self.isstagingareaclean():
            # nothing to commit
            return

        index = self.repo.index
        index.read()
        tree = index.write_tree()

        try:
            author = Signature(self.author_name, self.author_email)
            committer = Signature(self.author_name, self.author_email)

            if len(self.repo.listall_reference_objects()) == 0:
                # initial commit
                if message is None:
                    message = 'Initial Commit from QuitStore'
                self.repo.create_commit('HEAD', author, committer, message,
                                        tree, [])
            else:
                if message is None:
                    message = 'New Commit from QuitStore'
                self.repo.create_commit('HEAD', author, committer, message,
                                        tree,
                                        [self.repo.head.get_object().hex])
            logger.info('Updates committed')
        except Exception as e:
            logger.info('Nothing to commit')
            logger.debug(e)

        if self.gc:
            self.garbagecollection()

    def commitexists(self, commitid):
        """Check if a commit id is part of the repository history.

        Args:
            commitid: String of a Git commit id.
        Returns:
            True, if commitid is part of the commit log
            False, else.
        """
        return commitid in self.getids()

    def garbagecollection(self):
        """Start garbage collection."""
        try:
            # check if the garbage collection process is still running
            if self.gcProcess is None or self.gcProcess.poll() is not None:
                # start garbage collection with the "--auto" option, which
                # terminates immediately if it is not necessary
                self.gcProcess = Popen(["git", "gc", "--auto", "--quiet"],
                                       cwd=self.path)
                logger.debug('Spawn garbage collection')
        except Exception as e:
            logger.debug('Git garbage collection failed to spawn')
            logger.debug(e)

    def getpath(self):
        """Return the path of the git repository.

        Returns:
            A string containing the path to the directory of the git repo.
        """
        return self.path

    def getcommits(self):
        """Return metadata about existing commits.

        Returns:
            A list containing dictionaries with commit metadata.
        """
        commits = []
        if len(self.repo.listall_reference_objects()) > 0:
            for commit in self.repo.walk(self.repo.head.target,
                                         GIT_SORT_REVERSE):
                commits.append({
                    'id': str(commit.oid),
                    'message': str(commit.message),
                    'commit_date': datetime.fromtimestamp(
                        commit.commit_time).strftime('%Y-%m-%dT%H:%M:%SZ'),
                    'author_name': commit.author.name,
                    'author_email': commit.author.email,
                    'parents': [c.hex for c in commit.parents],
                })
        return commits

    def getids(self):
        """Return the ids of existing commits.

        Returns:
            A list containing the ids of all commits.
        """
        ids = []
        if len(self.repo.listall_reference_objects()) > 0:
            for commit in self.repo.walk(self.repo.head.target,
                                         GIT_SORT_REVERSE):
                ids.append(str(commit.oid))
        return ids

    def isgarbagecollectionon(self):
        """Return whether gc is activated or not.

        Returns:
            True, if activated
            False, if not.
        """
        return self.gc

    def isstagingareaclean(self):
        """Check if the staging area is clean.

        Returns:
            True, if the staging area is clean
            False, else.
        """
        status = self.repo.status()

        for filepath, flags in status.items():
            if flags != GIT_STATUS_CURRENT:
                return False

        return True
""" try: self.repo.remotes[remote].fetch() except Exception as e: logger.info("Can not pull: Remote {} not found.".format(remote)) logger.debug(e) ref = 'refs/remotes/' + remote + '/' + branch remoteid = self.repo.lookup_reference(ref).target analysis, _ = self.repo.merge_analysis(remoteid) if analysis & GIT_MERGE_ANALYSIS_UP_TO_DATE: # Already up-to-date pass elif analysis & GIT_MERGE_ANALYSIS_FASTFORWARD: # fastforward self.repo.checkout_tree(self.repo.get(remoteid)) master_ref = self.repo.lookup_reference('refs/heads/master') master_ref.set_target(remoteid) self.repo.head.set_target(remoteid) elif analysis & GIT_MERGE_ANALYSIS_NORMAL: self.repo.merge(remoteid) tree = self.repo.index.write_tree() msg = 'Merge from ' + remote + ' ' + branch author = Signature(self.author_name, self.author_email) comitter = Signature(self.author_name, self.author_email) self.repo.create_commit('HEAD', author, comitter, msg, tree, [self.repo.head.target, remoteid]) self.repo.state_cleanup() else: logger.debug('Can not pull. Unknown merge analysis result') def push(self, remote='origin', branch='master'): """Push if possible. Return: True: If successful. False: If diverged or nothing to push. """ ref = ['refs/heads/' + branch] try: remo = self.repo.remotes[remote] except Exception as e: logger.info( "Can not push. Remote: {} does not exist.".format(remote)) logger.debug(e) return try: remo.push(ref, callbacks=self.callback) except Exception as e: logger.info("Can not push to {} with ref {}".format( remote, str(ref))) logger.debug(e) def getRemotes(self): remotes = {} try: for remote in self.repo.remotes: remotes[remote.name] = [remote.url, remote.push_url] except Exception as e: logger.info('No remotes found.') logger.debug(e) return {} return remotes