예제 #1
0
def test_get_all_commits():
    """Check the commit count and the first/last hashes of ``test-repos/git-1/``."""
    repo = GitRepository('test-repos/git-1/')
    commits = [commit for commit in repo.get_list_commits()]

    # The length is checked first so a short list fails here rather than
    # with an IndexError below.
    assert len(commits) == 13

    # (position in the returned list, expected hash)
    pinned = (
        (0, '866e997a9e44cb4ddd9e00efe49361420aff2559'),
        (12, 'e7d13b0511f8a176284ce4f92ed8c6e8d09c77f2'),
    )
    for position, expected_hash in pinned:
        assert commits[position].hash == expected_hash
예제 #2
0
    def traverse_commits(self) -> Generator[Commit, None, None]:
        """
        Analyze all the specified commits (all of them by default), returning
        a generator of commits.

        ``self._path_to_repo`` may be a single string or a list of strings; a
        single string is wrapped in a list (and stored back on the instance).
        Each repository is visited in turn; remote ones are cloned into a
        temporary folder first. Commits for which
        ``self._is_commit_filtered`` returns a true value are logged and
        skipped.

        :return: generator yielding the commits that pass the filters
        """
        if isinstance(self._path_to_repo, str):
            self._path_to_repo = [self._path_to_repo]

        for path_repo in self._path_to_repo:
            # if it is a remote repo, clone it first in a temporary folder!
            if self._isremote(path_repo):
                # The directory is removed once the TemporaryDirectory object
                # is finalized, so the local reference must stay alive while
                # the clone is being analyzed.
                tmp_folder = tempfile.TemporaryDirectory()
                path_repo = self._clone_remote_repos(tmp_folder.name,
                                                     path_repo)

            git_repo = GitRepository(path_repo)

            self._sanity_check_filters(git_repo)
            self._check_timezones()

            # Logger arguments are passed lazily so the message is only
            # formatted when the INFO level is actually enabled.
            logger.info('Analyzing git repository in %s', git_repo.path)

            for commit in git_repo.get_list_commits(self._only_in_branch,
                                                    not self._reversed_order):
                logger.info('Commit #%s in %s from %s', commit.hash,
                            commit.committer_date, commit.author.name)

                if self._is_commit_filtered(commit):
                    logger.info('Commit #%s filtered', commit.hash)
                    continue

                yield commit
예제 #3
0
    def traverse_commits(self) -> Generator[Commit, None, None]:
        """
        Analyze all the specified commits (all of them by default), returning
        a generator of commits.

        For every entry of the ``'path_to_repos'`` configuration value:
        remote repositories are cloned (into the ``'clone_repo_to'`` folder
        when configured, otherwise into a temporary directory), a
        ``GitRepository`` is created and stored in the configuration under
        ``"git_repo"``, the filters are sanity-checked, and each commit
        returned by ``get_list_commits`` that the configuration does not
        filter out is yielded.

        The per-repository cleanup runs in a ``finally`` clause, so it also
        happens when the consumer stops iterating early or an error is raised.

        :return: generator yielding the commits that pass the filters
        :raises Exception: if ``'clone_repo_to'`` is set but is not an
            existing directory
        """
        for path_repo in self._conf.get('path_to_repos'):
            # if it is a remote repo, clone it first in a temporary folder!
            if self._is_remote(path_repo):
                if self._conf.get('clone_repo_to'):
                    clone_folder = str(Path(self._conf.get('clone_repo_to')))
                    if not os.path.isdir(clone_folder):
                        raise Exception(
                            "Not a directory: {0}".format(clone_folder))
                    path_repo = self._clone_remote_repos(
                        clone_folder, path_repo)
                else:
                    # The local reference keeps the temporary directory alive
                    # while the clone is being analyzed.
                    tmp_folder = tempfile.TemporaryDirectory()
                    path_repo = self._clone_remote_repos(
                        tmp_folder.name, path_repo)

            git_repo = GitRepository(path_repo, self._conf)
            try:
                self._conf.set_value("git_repo", git_repo)
                self._conf.sanity_check_filters()

                logger.info('Analyzing git repository in %s', git_repo.path)

                # Get the commits that modified the filepath. In this case, we
                # can not use git rev-list since it doesn't have the option
                # --follow, necessary to follow the renames. Hence, we
                # manually call git log instead
                if self._conf.get('filepath') is not None:
                    self._conf.set_value(
                        'filepath_commits',
                        git_repo.get_commits_modified_file(
                            self._conf.get('filepath')))

                # Gets only the commits that are tagged
                if self._conf.get('only_releases'):
                    self._conf.set_value('tagged_commits',
                                         git_repo.get_tagged_commits())

                # Build the arguments to pass to git rev-list.
                rev, kwargs = self._conf.build_args()

                # Iterate over all the commits returned by git rev-list
                for commit in git_repo.get_list_commits(rev, **kwargs):
                    logger.info('Commit #%s in %s from %s', commit.hash,
                                commit.committer_date, commit.author.name)

                    if self._conf.is_commit_filtered(commit):
                        logger.info('Commit #%s filtered', commit.hash)
                        continue

                    yield commit
            finally:
                # cleaning, this is necessary since GitPython issues on
                # memory leaks. Done in `finally` so that closing the
                # generator early (GeneratorExit is raised at the `yield`)
                # or an exception does not skip it.
                self._conf.set_value("git_repo", None)
                git_repo.clear()
예제 #4
0
    def traverse_commits(self) -> Generator[Commit, None, None]:
        """
        Analyze all the specified commits (all of them by default), returning
        a generator of commits.

        Each entry of the ``'path_to_repos'`` configuration value is handled
        in turn: remote entries are cloned into ``self._clone_folder()``, a
        ``GitRepository`` is created and recorded in the configuration
        (``"git_repo"`` and ``'path_to_repo'``), the filters are
        sanity-checked, and each commit returned by ``get_list_commits`` that
        the configuration does not filter out is yielded.

        The per-repository cleanup runs in a ``finally`` clause, so it also
        happens when the consumer stops iterating early or an error is raised.

        :return: generator yielding the commits that pass the filters
        """
        for path_repo in self._conf.get('path_to_repos'):
            if self._is_remote(path_repo):
                path_repo = self._clone_remote_repos(self._clone_folder(),
                                                     path_repo)

            git_repo = GitRepository(path_repo, self._conf)
            try:
                # saving the GitRepository object for further use
                self._conf.set_value("git_repo", git_repo)

                # when multiple repos are given in input, this variable will
                # serve as a reminder of which one we are currently analyzing
                self._conf.set_value('path_to_repo', path_repo)

                # checking that the filters are set correctly
                self._conf.sanity_check_filters()

                logger.info('Analyzing git repository in %s', git_repo.path)

                # Get the commits that modified the filepath. In this case, we
                # can not use git rev-list since it doesn't have the option
                # --follow, necessary to follow the renames. Hence, we
                # manually call git log instead
                if self._conf.get('filepath') is not None:
                    self._conf.set_value(
                        'filepath_commits',
                        git_repo.get_commits_modified_file(
                            self._conf.get('filepath')))

                # Gets only the commits that are tagged
                if self._conf.get('only_releases'):
                    self._conf.set_value('tagged_commits',
                                         git_repo.get_tagged_commits())

                # Build the arguments to pass to git rev-list.
                rev, kwargs = self._conf.build_args()

                # Iterate over all the commits returned by git rev-list
                for commit in git_repo.get_list_commits(rev, **kwargs):
                    logger.info('Commit #%s in %s from %s', commit.hash,
                                commit.committer_date, commit.author.name)

                    if self._conf.is_commit_filtered(commit):
                        logger.info('Commit #%s filtered', commit.hash)
                        continue

                    yield commit
            finally:
                # cleaning, this is necessary since GitPython issues on
                # memory leaks. Done in `finally` so that closing the
                # generator early (GeneratorExit is raised at the `yield`)
                # or an exception does not skip it.
                self._conf.set_value("git_repo", None)
                git_repo.clear()
예제 #5
0
def test_list_commits(repo: GitRepository):
    """Every commit listed by *repo* is one of five known hashes, and there are five."""
    expected_hashes = {
        'a88c84ddf42066611e76e6cb690144e5357d132c',
        '6411e3096dd2070438a17b225f44475136e54e3a',
        '09f6182cef737db02a085e1d018963c7a29bde5a',
        '1f99848edadfffa903b8ba1286a935f1b92b2845',
        'da39b1326dbc2edfe518b90672734a08f3c13458',
    }

    commits = list(repo.get_list_commits())

    # Membership first, then the count, mirroring the order of the checks.
    assert all(commit.hash in expected_hashes for commit in commits)
    assert len(commits) == 5
예제 #6
0
    def traverse_commits(self) -> Generator[Commit, None, None]:
        """
        Analyze all the specified commits (all of them by default), returning
        a generator of commits.

        For every entry of the ``'path_to_repos'`` configuration value:
        remote repositories are cloned (into the ``'clone_repo_to'`` folder
        when configured, otherwise into a temporary directory), a
        ``GitRepository`` is created and stored in the configuration under
        ``"git_repo"``, the filters are sanity-checked, and each commit
        returned by ``get_list_commits`` that the configuration does not
        filter out is yielded.

        The per-repository cleanup runs in a ``finally`` clause, so it also
        happens when the consumer stops iterating early or an error is raised.

        :return: generator yielding the commits that pass the filters
        :raises Exception: if ``'clone_repo_to'`` is set but is not an
            existing directory
        """
        for path_repo in self._conf.get('path_to_repos'):
            # if it is a remote repo, clone it first in a temporary folder!
            if self._is_remote(path_repo):
                if self._conf.get('clone_repo_to'):
                    clone_folder = str(Path(self._conf.get('clone_repo_to')))
                    if not os.path.isdir(clone_folder):
                        # Adjacent literals are joined before .format() is
                        # applied; no backslash continuation is needed inside
                        # the parentheses.
                        raise Exception("Not a directory: "
                                        "{0}".format(clone_folder))
                    path_repo = self._clone_remote_repos(
                        clone_folder, path_repo)
                else:
                    # The local reference keeps the temporary directory alive
                    # while the clone is being analyzed.
                    tmp_folder = tempfile.TemporaryDirectory()
                    path_repo = self._clone_remote_repos(
                        tmp_folder.name, path_repo)

            git_repo = GitRepository(path_repo, self._conf)
            try:
                self._conf.set_value("git_repo", git_repo)
                self._conf.sanity_check_filters()

                logger.info('Analyzing git repository in %s', git_repo.path)

                # Pre-compute the commits touching the configured file path.
                if self._conf.get('filepath') is not None:
                    self._conf.set_value(
                        'filepath_commits',
                        git_repo.get_commits_modified_file(
                            self._conf.get('filepath')))

                # Pre-compute the tagged commits when only releases are wanted.
                if self._conf.get('only_releases'):
                    self._conf.set_value('tagged_commits',
                                         git_repo.get_tagged_commits())

                for commit in git_repo.get_list_commits(
                        self._conf.get('only_in_branch'),
                        not self._conf.get('reversed_order')):
                    logger.info('Commit #%s in %s from %s', commit.hash,
                                commit.committer_date, commit.author.name)

                    if self._conf.is_commit_filtered(commit):
                        logger.info('Commit #%s filtered', commit.hash)
                        continue

                    yield commit
            finally:
                # cleaning; done in `finally` so that closing the generator
                # early (GeneratorExit is raised at the `yield`) or an
                # exception does not skip it.
                self._conf.set_value("git_repo", None)
                git_repo.clear()
def test_list_commits():
    """Every commit of ``test-repos/test1/`` is one of five known hashes, and there are five."""
    gr = GitRepository('test-repos/test1/')
    assert gr is not None
    # Materialize the result: if get_list_commits() hands back a lazy
    # iterable, len() would fail and the loop below would exhaust it before
    # the count check.
    change_sets = list(gr.get_list_commits())

    expected_hashes = {
        'a88c84ddf42066611e76e6cb690144e5357d132c',
        '6411e3096dd2070438a17b225f44475136e54e3a',
        '09f6182cef737db02a085e1d018963c7a29bde5a',
        '1f99848edadfffa903b8ba1286a935f1b92b2845',
        'da39b1326dbc2edfe518b90672734a08f3c13458',
    }

    for commit in change_sets:
        assert commit.hash in expected_hashes
    assert len(change_sets) == 5
예제 #8
0
    def traverse_commits(self) -> Generator[Commit, None, None]:
        """
        Yield, repository by repository, the commits that pass the filters.

        Each entry of ``self._path_to_repo`` is visited in turn; remote
        entries are cloned into a temporary folder first. Before iterating,
        the filters are sanity-checked and, when configured, the commits
        touching ``self._filepath`` and the tagged commits are stored on the
        instance. Commits for which ``self._is_commit_filtered`` returns a
        true value are logged and skipped.
        """
        for repo_location in self._path_to_repo:
            if self._isremote(repo_location):
                # Remote repository: work on a clone in a temporary folder.
                # The local reference keeps that folder alive meanwhile.
                clone_dir = tempfile.TemporaryDirectory()
                repo_location = self._clone_remote_repos(clone_dir.name,
                                                         repo_location)

            repository = GitRepository(repo_location)

            self._sanity_check_filters(repository)
            self._check_timezones()

            logger.info('Analyzing git repository in %s', repository.path)

            if self._filepath is not None:
                touching_file = repository.get_commits_modified_file(
                    self._filepath)
                self._filepath_commits = touching_file

            if self._only_releases:
                self._tagged_commits = repository.get_tagged_commits()

            candidates = repository.get_list_commits(
                self._only_in_branch, not self._reversed_order)
            for candidate in candidates:
                logger.info('Commit #%s in %s from %s', candidate.hash,
                            candidate.committer_date,
                            candidate.author.name)

                if not self._is_commit_filtered(candidate):
                    yield candidate
                else:
                    logger.info('Commit #%s filtered', candidate.hash)
class RepositoryMining:
    """
    Select the commits of one git repository and iterate over them.

    Selection by hash or by date range happens in
    ``_apply_filters_on_commits``; per-commit filters (main branch, branches,
    file types, merges) are applied in ``_is_commit_filtered`` while
    traversing.
    """

    def __init__(self, path_to_repo,
                 single=None,
                 since=None, to=None,
                 from_commit=None, to_commit=None,
                 from_tag=None, to_tag=None,
                 reversed_order=False,
                 only_in_main_branch=False,
                 only_in_branches=None,
                 only_modifications_with_file_types=None,
                 only_no_merge=False):
        """
        Init a repository mining.

        :param str path_to_repo: absolute path to the repository you have to analyze
        :param str single: hash of a single commit to analyze
        :param datetime since: starting date
        :param datetime to: ending date
        :param str from_commit: starting commit (only if `since` is None)
        :param str to_commit: ending commit (only if `to` is None)
        :param str from_tag: starting the analysis from specified tag (only if `since` and `from_commit` are None)
        :param str to_tag: ending the analysis from specified tag (only if `to` and `to_commit` are None)
        :param bool reversed_order: whether the commits should be analyzed in reversed order
        :param bool only_in_main_branch: whether only commits in main branch should be analyzed
        :param List[str] only_in_branches: only commits in these branches will be analyzed
        :param List[str] only_modifications_with_file_types: only modifications with that file types will be analyzed
        :param bool only_no_merge: if True, merges will not be analyzed
        :raises Exception: if mutually exclusive filters are combined
        """
        # The repository must exist before _check_filters, which resolves
        # commits and tags through it.
        self.git_repo = GitRepository(path_to_repo)
        self.single = single
        self.since = since
        self.to = to
        self.reversed_order = reversed_order
        self.only_in_main_branch = only_in_main_branch
        self.only_in_branches = only_in_branches
        self.only_modifications_with_file_types = only_modifications_with_file_types
        self.only_no_merge = only_no_merge

        self._check_filters(from_commit, from_tag, since, single, to, to_commit, to_tag)
        self._check_timezones()

    def _check_filters(self, from_commit, from_tag, since, single, to, to_commit, to_tag):
        """
        Reject incompatible filter combinations and turn commit/tag bounds
        into date bounds.

        ``from_commit``/``from_tag`` set ``self.since`` and
        ``to_commit``/``to_tag`` set ``self.to`` to the author date of the
        referenced commit.

        :raises Exception: if `single` is combined with any other bound, or
            if a start (or end) is given in more than one form
        """
        if single is not None:
            other_bounds = (since, to, from_commit, to_commit, from_tag, to_tag)
            if any(bound is not None for bound in other_bounds):
                raise Exception('You can not specify a single commit with other filters')

        if from_commit is not None:
            if since is not None:
                raise Exception('You can not specify both <since date> and <from commit>')
            self.since = self.git_repo.get_commit(from_commit).author_date

        if to_commit is not None:
            if to is not None:
                raise Exception('You can not specify both <to date> and <to commit>')
            self.to = self.git_repo.get_commit(to_commit).author_date

        if from_tag is not None:
            if since is not None or from_commit is not None:
                raise Exception('You can not specify <since date> or <from commit> when using <from tag>')
            self.since = self.git_repo.get_commit_from_tag(from_tag).author_date

        if to_tag is not None:
            if to is not None or to_commit is not None:
                raise Exception('You can not specify <to date> or <to commit> when using <to tag>')
            self.to = self.git_repo.get_commit_from_tag(to_tag).author_date

    def traverse_commits(self):
        """
        Analyze all the specified commits (all of them by default), returning
        a generator of commits.

        The selected commits are reversed relative to the order in which the
        repository lists them unless ``reversed_order`` is True.
        """
        logger.info('Git repository in %s', self.git_repo.path)

        # Copy before reordering: with no filter set,
        # _apply_filters_on_commits returns the very list it was given, and
        # reversing that in place would also reorder it for any other holder
        # of that list.
        all_cs = list(self._apply_filters_on_commits(self.git_repo.get_list_commits()))

        if not self.reversed_order:
            all_cs.reverse()

        for commit in all_cs:
            # NOTE(review): the .encode('utf-8') calls look like a Python 2
            # safeguard; on Python 3 they make the log show b'...' values.
            # Left untouched to keep the emitted messages unchanged - confirm
            # the supported Python versions before removing.
            logger.info('Commit #%s in %s from %s',
                        commit.hash.encode('utf-8'), commit.author_date,
                        commit.author.name.encode('utf-8'))

            if self._is_commit_filtered(commit):
                logger.info('Commit #%s filtered', commit.hash.encode('utf-8'))
                continue

            yield commit

    def _is_commit_filtered(self, commit):
        """
        Return True if `commit` must be skipped by the per-commit filters.

        A debug message is logged only when the commit is actually rejected.
        """
        if self.only_in_main_branch is True and commit.in_main_branch is False:
            logger.debug('Commit filtered for main branch')
            return True
        if self.only_in_branches is not None \
                and not self._commit_branch_in_branches(commit):
            logger.debug('Commit filtered for only in branches')
            return True
        if self.only_modifications_with_file_types is not None \
                and not self._has_modification_with_file_type(commit):
            logger.debug('Commit filtered for modification types')
            return True
        if self.only_no_merge is True and commit.merge is True:
            logger.debug('Commit filtered for no merge')
            return True
        return False

    def _commit_branch_in_branches(self, commit):
        """Return True if any branch of `commit` is in ``only_in_branches``."""
        return any(branch in self.only_in_branches
                   for branch in commit.branches)

    def _has_modification_with_file_type(self, commit):
        """Return True if any modified file name ends with a wanted file type."""
        # str.endswith accepts a tuple of suffixes; build it once.
        wanted_suffixes = tuple(self.only_modifications_with_file_types)
        return any(mod.filename.endswith(wanted_suffixes)
                   for mod in commit.modifications)

    def _apply_filters_on_commits(self, all_commits):
        """
        Select commits by ``single`` hash or by the ``since``/``to`` dates.

        :param all_commits: the commits to select from
        :return: `all_commits` itself when no such filter is set; a
            one-element list with the first commit whose hash equals
            ``single`` (an empty list if there is none); otherwise the
            commits whose author date lies within the inclusive
            ``since``/``to`` bounds
        """
        if self._all_filters_are_none():
            return all_commits

        if self.single is not None:
            # An unknown hash selects nothing; it must not fall through to
            # the (unset) date bounds and select every commit.
            for commit in all_commits:
                if commit.hash == self.single:
                    return [commit]
            return []

        return [commit for commit in all_commits
                if (self.since is None or self.since <= commit.author_date)
                and (self.to is None or commit.author_date <= self.to)]

    def _all_filters_are_none(self):
        """Return True if neither ``single``, ``since`` nor ``to`` is set."""
        return self.single is None and self.since is None and self.to is None

    @staticmethod
    def _is_naive(date):
        """Return True if `date` carries no usable UTC offset."""
        return date.tzinfo is None or date.tzinfo.utcoffset(date) is None

    def _check_timezones(self):
        """Attach UTC to ``since``/``to`` when they carry no usable timezone."""
        if self.since is not None and self._is_naive(self.since):
            self.since = self.since.replace(tzinfo=pytz.utc)
        if self.to is not None and self._is_naive(self.to):
            self.to = self.to.replace(tzinfo=pytz.utc)
예제 #10
0
def commit_by_msg(repo: GitRepository, msg: str) -> Commit:
    """
    Return the first commit listed by *repo* whose message equals *msg*.

    :raises Exception: if no listed commit has exactly that message
    """
    matching = (candidate for candidate in repo.get_list_commits()
                if candidate.msg == msg)
    found = next(matching, None)
    if found is None:
        raise Exception('cannot find commit with msg {}'.format(msg))
    return found
예제 #11
0
    def traverse_commits(self) -> Generator[Commit, None, None]:
        """
        Analyze all the specified commits (all of them by default), returning
        a generator of commits.

        Each entry of the ``'path_to_repos'`` configuration value is handled
        in turn: remote entries are cloned into ``self._clone_folder()``, the
        local path is made absolute, a ``GitRepository`` is created and
        recorded in the configuration (``'path_to_repo'`` and ``"git_repo"``),
        the filters are sanity-checked, and each commit returned by
        ``get_list_commits`` that the configuration does not filter out is
        yielded.

        The per-repository cleanup (releasing the repository and, for remote
        repositories when ``self._cleanup`` is True, removing the temporary
        directory) runs in a ``finally`` clause, so it also happens when the
        consumer stops iterating early or an error is raised.

        :return: generator yielding the commits that pass the filters
        """
        for path_repo in self._conf.get('path_to_repos'):
            # Evaluated once; the answer is needed both for cloning and for
            # the cleanup below.
            is_remote = self._is_remote(path_repo)

            local_path_repo = path_repo
            if is_remote:
                local_path_repo = self._clone_remote_repo(
                    self._clone_folder(), path_repo)

            # Get absolute path
            local_path_repo = str(Path(local_path_repo).expanduser().resolve())

            # when multiple repos are given in input, this variable will serve
            # as a reminder of which one we are currently analyzing
            self._conf.set_value('path_to_repo', local_path_repo)

            git_repo = GitRepository(local_path_repo, self._conf)
            try:
                # saving the GitRepository object for further use
                self._conf.set_value("git_repo", git_repo)

                # checking that the filters are set correctly
                self._conf.sanity_check_filters()

                logger.info('Analyzing git repository in %s', git_repo.path)

                # Get the commits that modified the filepath. In this case, we
                # can not use git rev-list since it doesn't have the option
                # --follow, necessary to follow the renames. Hence, we
                # manually call git log instead
                if self._conf.get('filepath') is not None:
                    self._conf.set_value(
                        'filepath_commits',
                        git_repo.get_commits_modified_file(
                            self._conf.get('filepath')))

                # Gets only the commits that are tagged
                if self._conf.get('only_releases'):
                    self._conf.set_value('tagged_commits',
                                         git_repo.get_tagged_commits())

                # Build the arguments to pass to git rev-list.
                rev, kwargs = self._conf.build_args()

                # Iterate over all the commits returned by git rev-list
                for commit in git_repo.get_list_commits(rev, **kwargs):
                    logger.info('Commit #%s in %s from %s', commit.hash,
                                commit.committer_date, commit.author.name)

                    if self._conf.is_commit_filtered(commit):
                        logger.info('Commit #%s filtered', commit.hash)
                        continue

                    yield commit
            finally:
                # cleaning, this is necessary since GitPython issues on
                # memory leaks. Done in `finally` so that closing the
                # generator early (GeneratorExit is raised at the `yield`)
                # or an exception does not skip it.
                self._conf.set_value("git_repo", None)
                git_repo.clear()

                # delete the temporary directory if created
                if is_remote and self._cleanup is True:
                    assert self._tmp_dir is not None
                    try:
                        self._tmp_dir.cleanup()
                    except PermissionError:
                        # on Windows, Python 3.5, 3.6, 3.7 are not able to
                        # delete git directories because of read-only files.
                        # This is now fixed in python 3.8. In this case, we
                        # need to use an onerror callback to clear the
                        # read-only bit.
                        # see https://docs.python.org/3/library/shutil.html?highlight=shutil#rmtree-example
                        def remove_readonly(func, path, _):
                            os.chmod(path, stat.S_IWRITE)
                            func(path)

                        shutil.rmtree(self._tmp_dir.name,
                                      onerror=remove_readonly)