def test_get_all_commits():
    """Check the number of commits in ``git-1`` and the hashes at both ends."""
    repository = GitRepository('test-repos/git-1/')
    commits = list(repository.get_list_commits())

    assert len(commits) == 13

    # (position in the returned list, hash expected at that position)
    expected_at = (
        (0, '866e997a9e44cb4ddd9e00efe49361420aff2559'),
        (12, 'e7d13b0511f8a176284ce4f92ed8c6e8d09c77f2'),
    )
    for position, expected_hash in expected_at:
        assert commits[position].hash == expected_hash
def traverse_commits(self) -> Generator[Commit, None, None]:
    """
    Analyze all the specified commits (all of them by default), returning
    a generator of commits.

    A single repository path given as a string is first wrapped in a list,
    then every repository is processed in turn. Remote repositories are
    cloned into a temporary folder before being analyzed. Commits for which
    ``self._is_commit_filtered`` returns a true value are skipped.

    :return: generator yielding the commits that pass the filters
    """
    if isinstance(self._path_to_repo, str):
        self._path_to_repo = [self._path_to_repo]

    for path_repo in self._path_to_repo:
        # if it is a remote repo, clone it first in a temporary folder!
        if self._isremote(path_repo):
            # keep a reference to the TemporaryDirectory object for as long
            # as the clone is in use: the folder is removed once the object
            # is finalized
            tmp_folder = tempfile.TemporaryDirectory()
            path_repo = self._clone_remote_repos(tmp_folder.name, path_repo)

        git_repo = GitRepository(path_repo)

        self._sanity_check_filters(git_repo)
        self._check_timezones()

        # logger arguments are passed lazily so that the message is only
        # built when the record is actually emitted
        logger.info('Analyzing git repository in %s', git_repo.path)

        for commit in git_repo.get_list_commits(self._only_in_branch,
                                                not self._reversed_order):
            logger.info('Commit #%s in %s from %s', commit.hash,
                        commit.committer_date, commit.author.name)

            if self._is_commit_filtered(commit):
                logger.info('Commit #%s filtered', commit.hash)
                continue

            yield commit
def traverse_commits(self) -> Generator[Commit, None, None]:
    """
    Yield, one repository after the other, every commit that is not
    rejected by the configured filters.

    Remote repositories are cloned first, either into the folder configured
    under ``clone_repo_to`` or into a fresh temporary directory.

    :raises Exception: if ``clone_repo_to`` is set but is not a directory
    """
    for repo_location in self._conf.get('path_to_repos'):
        # a remote repository has to be cloned before it can be analyzed
        if self._is_remote(repo_location):
            if not self._conf.get('clone_repo_to'):
                # no destination configured: clone into a temporary folder
                scratch_dir = tempfile.TemporaryDirectory()
                repo_location = self._clone_remote_repos(
                    scratch_dir.name, repo_location)
            else:
                destination = str(Path(self._conf.get('clone_repo_to')))
                if not os.path.isdir(destination):
                    raise Exception(
                        "Not a directory: {0}".format(destination))
                repo_location = self._clone_remote_repos(
                    destination, repo_location)

        git_repo = GitRepository(repo_location, self._conf)
        self._conf.set_value("git_repo", git_repo)
        self._conf.sanity_check_filters()

        logger.info('Analyzing git repository in %s', git_repo.path)

        # The commits touching a given file are collected with git log
        # rather than git rev-list: only the former supports --follow,
        # which is needed to keep track of renames.
        if self._conf.get('filepath') is not None:
            file_history = git_repo.get_commits_modified_file(
                self._conf.get('filepath'))
            self._conf.set_value('filepath_commits', file_history)

        # Collect the tagged commits when only releases are requested
        if self._conf.get('only_releases'):
            release_commits = git_repo.get_tagged_commits()
            self._conf.set_value('tagged_commits', release_commits)

        # Arguments that will be handed over to git rev-list
        rev, kwargs = self._conf.build_args()

        # Walk through everything git rev-list returned
        for commit in git_repo.get_list_commits(rev, **kwargs):
            logger.info('Commit #%s in %s from %s', commit.hash,
                        commit.committer_date, commit.author.name)

            if not self._conf.is_commit_filtered(commit):
                yield commit
            else:
                logger.info('Commit #%s filtered', commit.hash)

        # explicit cleanup, needed because of GitPython memory leak issues
        self._conf.set_value("git_repo", None)
        git_repo.clear()
def traverse_commits(self) -> Generator[Commit, None, None]:
    """
    Yield, one repository after the other, every commit that is not
    rejected by the configured filters.

    Remote repositories are cloned into the folder returned by
    ``self._clone_folder()`` before being analyzed.
    """
    for repo_location in self._conf.get('path_to_repos'):
        if self._is_remote(repo_location):
            destination = self._clone_folder()
            repo_location = self._clone_remote_repos(destination,
                                                     repo_location)

        git_repo = GitRepository(repo_location, self._conf)

        # keep the GitRepository object around for further use
        self._conf.set_value("git_repo", git_repo)
        # with several input repositories, this entry records which one is
        # currently being analyzed
        self._conf.set_value('path_to_repo', repo_location)

        # make sure the filters are set correctly
        self._conf.sanity_check_filters()

        logger.info('Analyzing git repository in %s', git_repo.path)

        # The commits touching a given file are collected with git log
        # rather than git rev-list: only the former supports --follow,
        # which is needed to keep track of renames.
        if self._conf.get('filepath') is not None:
            file_history = git_repo.get_commits_modified_file(
                self._conf.get('filepath'))
            self._conf.set_value('filepath_commits', file_history)

        # Collect the tagged commits when only releases are requested
        if self._conf.get('only_releases'):
            release_commits = git_repo.get_tagged_commits()
            self._conf.set_value('tagged_commits', release_commits)

        # Arguments that will be handed over to git rev-list
        rev, kwargs = self._conf.build_args()

        # Walk through everything git rev-list returned
        for commit in git_repo.get_list_commits(rev, **kwargs):
            logger.info('Commit #%s in %s from %s', commit.hash,
                        commit.committer_date, commit.author.name)

            if not self._conf.is_commit_filtered(commit):
                yield commit
            else:
                logger.info('Commit #%s filtered', commit.hash)

        # explicit cleanup, needed because of GitPython memory leak issues
        self._conf.set_value("git_repo", None)
        git_repo.clear()
def test_list_commits(repo: GitRepository):
    """Check that the repository lists exactly the five known commits."""
    known_hashes = {
        'a88c84ddf42066611e76e6cb690144e5357d132c',
        '6411e3096dd2070438a17b225f44475136e54e3a',
        '09f6182cef737db02a085e1d018963c7a29bde5a',
        '1f99848edadfffa903b8ba1286a935f1b92b2845',
        'da39b1326dbc2edfe518b90672734a08f3c13458',
    }
    commits = list(repo.get_list_commits())

    # every returned commit must be one of the known ones ...
    for listed in commits:
        assert listed.hash in known_hashes
    # ... and there must be five of them
    assert len(commits) == 5
def traverse_commits(self) -> Generator[Commit, None, None]:
    """
    Yield, one repository after the other, every commit that is not
    rejected by the configured filters.

    Remote repositories are cloned first, either into the folder configured
    under ``clone_repo_to`` or into a fresh temporary directory.

    :raises Exception: if ``clone_repo_to`` is set but is not a directory
    """
    for repo_location in self._conf.get('path_to_repos'):
        # a remote repository has to be cloned before it can be analyzed
        if self._is_remote(repo_location):
            if not self._conf.get('clone_repo_to'):
                # no destination configured: clone into a temporary folder
                scratch_dir = tempfile.TemporaryDirectory()
                repo_location = self._clone_remote_repos(
                    scratch_dir.name, repo_location)
            else:
                destination = str(Path(self._conf.get('clone_repo_to')))
                if not os.path.isdir(destination):
                    raise Exception(("Not a directory: "
                                     "{0}").format(destination))
                repo_location = self._clone_remote_repos(
                    destination, repo_location)

        git_repo = GitRepository(repo_location, self._conf)
        self._conf.set_value("git_repo", git_repo)
        self._conf.sanity_check_filters()

        logger.info('Analyzing git repository in %s', git_repo.path)

        # commits that modified the configured file, if one was given
        if self._conf.get('filepath') is not None:
            file_history = git_repo.get_commits_modified_file(
                self._conf.get('filepath'))
            self._conf.set_value('filepath_commits', file_history)

        # tagged commits, when only releases are requested
        if self._conf.get('only_releases'):
            release_commits = git_repo.get_tagged_commits()
            self._conf.set_value('tagged_commits', release_commits)

        branch_only = self._conf.get('only_in_branch')
        second_arg = not self._conf.get('reversed_order')
        for commit in git_repo.get_list_commits(branch_only, second_arg):
            logger.info('Commit #%s in %s from %s', commit.hash,
                        commit.committer_date, commit.author.name)

            if not self._conf.is_commit_filtered(commit):
                yield commit
            else:
                logger.info('Commit #%s filtered', commit.hash)

        # cleaning
        self._conf.set_value("git_repo", None)
        git_repo.clear()
def test_list_commits():
    """Check that ``test1`` lists exactly the five known commits."""
    repository = GitRepository('test-repos/test1/')
    assert repository is not None

    known_hashes = [
        'a88c84ddf42066611e76e6cb690144e5357d132c',
        '6411e3096dd2070438a17b225f44475136e54e3a',
        '09f6182cef737db02a085e1d018963c7a29bde5a',
        '1f99848edadfffa903b8ba1286a935f1b92b2845',
        'da39b1326dbc2edfe518b90672734a08f3c13458',
    ]

    # the returned object is used as-is: iterated once, then measured
    commits = repository.get_list_commits()
    for listed in commits:
        assert listed.hash in known_hashes

    assert len(commits) == 5
def traverse_commits(self) -> Generator[Commit, None, None]:
    """
    Yield, one repository after the other, every commit that is not
    rejected by ``self._is_commit_filtered``.

    Remote repositories are cloned into a temporary folder before being
    analyzed.
    """
    for repo_location in self._path_to_repo:
        # a remote repository has to be cloned before it can be analyzed
        if self._isremote(repo_location):
            scratch_dir = tempfile.TemporaryDirectory()
            repo_location = self._clone_remote_repos(scratch_dir.name,
                                                     repo_location)

        git_repo = GitRepository(repo_location)

        self._sanity_check_filters(git_repo)
        self._check_timezones()

        logger.info('Analyzing git repository in %s', git_repo.path)

        # commits that modified the requested file, if one was given
        if self._filepath is not None:
            file_history = git_repo.get_commits_modified_file(self._filepath)
            self._filepath_commits = file_history

        # tagged commits, when only releases are requested
        if self._only_releases:
            self._tagged_commits = git_repo.get_tagged_commits()

        second_arg = not self._reversed_order
        for commit in git_repo.get_list_commits(self._only_in_branch,
                                                second_arg):
            logger.info('Commit #%s in %s from %s', commit.hash,
                        commit.committer_date, commit.author.name)

            if not self._is_commit_filtered(commit):
                yield commit
            else:
                logger.info('Commit #%s filtered', commit.hash)
class RepositoryMining:
    """
    Traverse the commits of one Git repository, optionally restricted by
    date, commit, tag, branch, file type or merge filters.
    """

    def __init__(self, path_to_repo, single=None, since=None, to=None,
                 from_commit=None, to_commit=None, from_tag=None,
                 to_tag=None, reversed_order=False,
                 only_in_main_branch=False, only_in_branches=None,
                 only_modifications_with_file_types=None,
                 only_no_merge=False):
        """
        Init a repository mining.

        :param str path_to_repo: absolute path to the repository you have
            to analyze
        :param str single: hash of a single commit to analyze
        :param datetime since: starting date
        :param datetime to: ending date
        :param str from_commit: starting commit (only if `since` is None)
        :param str to_commit: ending commit (only if `to` is None)
        :param str from_tag: starting the analysis from specified tag (only
            if `since` and `from_commit` are None)
        :param str to_tag: ending the analysis from specified tag (only if
            `to` and `to_commit` are None)
        :param bool reversed_order: whether the commits should be analyzed
            in reversed order
        :param bool only_in_main_branch: whether only commits in main
            branch should be analyzed
        :param List[str] only_in_branches: only commits in these branches
            will be analyzed
        :param List[str] only_modifications_with_file_types: only
            modifications with that file types will be analyzed
        :param bool only_no_merge: if True, merges will not be analyzed
        :raises Exception: if the given filters are mutually incompatible
        """
        self.git_repo = GitRepository(path_to_repo)
        self.single = single
        self.since = since
        self.to = to
        self.reversed_order = reversed_order
        self.only_in_main_branch = only_in_main_branch
        self.only_in_branches = only_in_branches
        self.only_modifications_with_file_types = \
            only_modifications_with_file_types
        self.only_no_merge = only_no_merge

        self._check_filters(from_commit, from_tag, since, single, to,
                            to_commit, to_tag)
        self._check_timezones()

    def _check_filters(self, from_commit, from_tag, since, single, to,
                       to_commit, to_tag):
        """
        Reject incompatible filter combinations and turn commit/tag bounds
        into date bounds (``self.since`` / ``self.to``), using the author
        date of the referenced commit.

        :raises Exception: if two filters that exclude each other are set
        """
        if single is not None:
            others = (since, to, from_commit, to_commit, from_tag, to_tag)
            if any(other is not None for other in others):
                raise Exception('You can not specify a single commit with '
                                'other filters')

        if from_commit is not None:
            if since is not None:
                raise Exception('You can not specify both <since date> and '
                                '<from commit>')
            self.since = self.git_repo.get_commit(from_commit).author_date

        if to_commit is not None:
            if to is not None:
                raise Exception('You can not specify both <to date> and '
                                '<to commit>')
            self.to = self.git_repo.get_commit(to_commit).author_date

        if from_tag is not None:
            if since is not None or from_commit is not None:
                raise Exception('You can not specify <since date> or '
                                '<from commit> when using <from tag>')
            self.since = \
                self.git_repo.get_commit_from_tag(from_tag).author_date

        if to_tag is not None:
            if to is not None or to_commit is not None:
                raise Exception('You can not specify <to date> or '
                                '<to commit> when using <to tag>')
            self.to = self.git_repo.get_commit_from_tag(to_tag).author_date

    def traverse_commits(self):
        """
        Analyze all the specified commits (all of them by default),
        returning a generator of commits.

        The list obtained from the repository is reversed unless
        ``reversed_order`` is set; commits rejected by
        ``_is_commit_filtered`` are skipped.
        """
        logger.info('Git repository in %s', self.git_repo.path)
        all_cs = self._apply_filters_on_commits(
            self.git_repo.get_list_commits())

        if not self.reversed_order:
            # reverse a copy, so that the list returned by
            # get_list_commits() is never mutated in place
            all_cs = list(reversed(all_cs))

        for commit in all_cs:
            logger.info('Commit #%s in %s from %s',
                        commit.hash.encode('utf-8'), commit.author_date,
                        commit.author.name.encode('utf-8'))

            if self._is_commit_filtered(commit):
                logger.info('Commit #%s filtered',
                            commit.hash.encode('utf-8'))
                continue

            yield commit

    def _is_commit_filtered(self, commit):
        """
        Return True if `commit` has to be skipped according to the branch,
        file type and merge filters, False otherwise.

        The debug message of a filter is logged only when that filter is
        the one rejecting the commit.
        """
        if self.only_in_main_branch is True and \
                commit.in_main_branch is False:
            logger.debug('Commit filtered for main branch')
            return True

        if self.only_in_branches is not None and \
                not self._commit_branch_in_branches(commit):
            logger.debug('Commit filtered for only in branches')
            return True

        if self.only_modifications_with_file_types is not None and \
                not self._has_modification_with_file_type(commit):
            logger.debug('Commit filtered for modification types')
            return True

        if self.only_no_merge is True and commit.merge is True:
            logger.debug('Commit filtered for no merge')
            return True

        return False

    def _commit_branch_in_branches(self, commit):
        """Return True if any branch of `commit` is in `only_in_branches`."""
        return any(branch in self.only_in_branches
                   for branch in commit.branches)

    def _has_modification_with_file_type(self, commit):
        """
        Return True if at least one modification of `commit` touches a file
        whose name ends with one of the configured file types.
        """
        # str.endswith accepts a tuple of suffixes; build it only once
        suffixes = tuple(self.only_modifications_with_file_types)
        return any(mod.filename.endswith(suffixes)
                   for mod in commit.modifications)

    def _apply_filters_on_commits(self, all_commits):
        """
        Restrict `all_commits` according to the `single`, `since` and `to`
        filters.

        :param all_commits: the commits of the repository
        :return: `all_commits` itself if no such filter is set; a list with
            the commit whose hash equals `single` (empty if there is no
            such commit) when `single` is set; otherwise the commits whose
            author date lies between `since` and `to` (bounds included)
        """
        if self._all_filters_are_none():
            return all_commits

        if self.single is not None:
            # an unknown hash must select nothing, not every commit
            for commit in all_commits:
                if commit.hash == self.single:
                    return [commit]
            return []

        return [commit for commit in all_commits
                if (self.since is None
                    or self.since <= commit.author_date)
                and (self.to is None or commit.author_date <= self.to)]

    def _all_filters_are_none(self):
        """Return True if none of `single`, `since` and `to` is set."""
        return self.single is None and self.since is None and \
            self.to is None

    def _check_timezones(self):
        """Attach the UTC timezone to `since` / `to` when they are naive."""
        if self.since is not None:
            if self.since.tzinfo is None or \
                    self.since.tzinfo.utcoffset(self.since) is None:
                self.since = self.since.replace(tzinfo=pytz.utc)
        if self.to is not None:
            if self.to.tzinfo is None or \
                    self.to.tzinfo.utcoffset(self.to) is None:
                self.to = self.to.replace(tzinfo=pytz.utc)
def commit_by_msg(repo: GitRepository, msg: str) -> Commit:
    """
    Return the first commit listed by `repo` whose message equals `msg`.

    :raises Exception: if no commit has that message
    """
    matching = (candidate for candidate in repo.get_list_commits()
                if candidate.msg == msg)
    found = next(matching, None)
    if found is None:
        raise Exception('cannot find commit with msg {}'.format(msg))
    return found
def traverse_commits(self) -> Generator[Commit, None, None]:
    """
    Yield, one repository after the other, every commit that is not
    rejected by the configured filters.

    Remote repositories are cloned into the folder returned by
    ``self._clone_folder()``; when ``self._cleanup`` is True the temporary
    directory is deleted once the repository has been fully traversed.
    """
    for path_repo in self._conf.get('path_to_repos'):
        if self._is_remote(path_repo):
            local_path_repo = self._clone_remote_repo(self._clone_folder(),
                                                      path_repo)
        else:
            local_path_repo = path_repo

        # turn the location into an absolute path
        absolute = Path(local_path_repo).expanduser().resolve()
        local_path_repo = str(absolute)

        # with several input repositories, this entry records which one is
        # currently being analyzed
        self._conf.set_value('path_to_repo', local_path_repo)

        git_repo = GitRepository(local_path_repo, self._conf)
        # keep the GitRepository object around for further use
        self._conf.set_value("git_repo", git_repo)

        # make sure the filters are set correctly
        self._conf.sanity_check_filters()

        logger.info('Analyzing git repository in %s', git_repo.path)

        # The commits touching a given file are collected with git log
        # rather than git rev-list: only the former supports --follow,
        # which is needed to keep track of renames.
        if self._conf.get('filepath') is not None:
            file_history = git_repo.get_commits_modified_file(
                self._conf.get('filepath'))
            self._conf.set_value('filepath_commits', file_history)

        # Collect the tagged commits when only releases are requested
        if self._conf.get('only_releases'):
            release_commits = git_repo.get_tagged_commits()
            self._conf.set_value('tagged_commits', release_commits)

        # Arguments that will be handed over to git rev-list
        rev, kwargs = self._conf.build_args()

        # Walk through everything git rev-list returned
        for commit in git_repo.get_list_commits(rev, **kwargs):
            logger.info('Commit #%s in %s from %s', commit.hash,
                        commit.committer_date, commit.author.name)

            if not self._conf.is_commit_filtered(commit):
                yield commit
            else:
                logger.info('Commit #%s filtered', commit.hash)

        # explicit cleanup, needed because of GitPython memory leak issues
        self._conf.set_value("git_repo", None)
        git_repo.clear()

        # nothing else to do unless a temporary clone has to be deleted
        if not (self._is_remote(path_repo) and self._cleanup is True):
            continue

        assert self._tmp_dir is not None
        try:
            self._tmp_dir.cleanup()
        except PermissionError:
            # On Windows, Python 3.5, 3.6 and 3.7 cannot delete git
            # directories because of read-only files (fixed in Python 3.8).
            # The onerror callback below clears the read-only bit and
            # retries the failed operation, see
            # https://docs.python.org/3/library/shutil.html?highlight=shutil#rmtree-example
            def remove_readonly(func, path, _):
                os.chmod(path, stat.S_IWRITE)
                func(path)

            shutil.rmtree(self._tmp_dir.name, onerror=remove_readonly)