示例#1
0
def test_extract_edits_1(git_repo_dir):
    """Check line-level edit extraction for ``first_lines.txt`` in a known commit.

    ``_extract_edits`` is called with ``use_blocks`` disabled; the test expects
    three edits and checks the ``original_commit_addition`` value of each one.

    Args:
        git_repo_dir: Repository location handed to ``pydriller.GitRepository``.
    """
    commit_hash = 'b17c2c321ce8d299de3d063ca0a1b0b363477505'
    filename = 'first_lines.txt'

    extraction_settings = {
        'use_blocks': False,
        'blame_C': 'CCC4',
        'extract_complexity': True,
        'extract_text': True
    }

    git_repo = pydriller.GitRepository(git_repo_dir)
    commit = git_repo.get_commit(commit_hash)
    df = None
    for mod in commit.modifications:
        if mod.filename == filename:
            df = git2net.extraction._extract_edits(git_repo, commit, mod,
                                                   extraction_settings)

    # Fail with a readable message (instead of a NameError) when the commit
    # does not contain a modification of the expected file.
    assert df is not None, '{} is not modified in commit {}'.format(
        filename, commit_hash)

    assert len(df) == 3
    assert df.at[
        0,
        'original_commit_addition'] == 'e4448e87541d19d139b9d033b2578941a53d1f97'
    assert df.at[
        1,
        'original_commit_addition'] == '6b531fcb57d5b9d98dd983cb65357d82ccca647b'
    # No origin commit is expected here, as there is no match due to line ending.
    assert df.at[2, 'original_commit_addition'] is None
示例#2
0
def test_extract_edits_2(repo_string):
    """Block-level extraction of ``first_lines.txt`` must yield a single edit.

    ``_extract_edits`` is called with ``use_blocks=True``; the only resulting
    row is expected to carry the placeholder origin
    ``'not available with use_blocks'``.
    """
    target_commit = 'b17c2c321ce8d299de3d063ca0a1b0b363477505'
    target_file = 'first_lines.txt'

    repository = pydriller.GitRepository(repo_string)
    commit = repository.get_commit(target_commit)

    edits = None
    for modification in commit.modifications:
        if modification.filename != target_file:
            continue
        # The result of the last matching modification is the one checked.
        edits = git2net.extraction._extract_edits(
            repository, commit, modification,
            use_blocks=True, blame_C='CCC4')

    assert len(edits) == 1
    origin = edits.at[0, 'original_commit_addition']
    assert origin == 'not available with use_blocks'
示例#3
0
def test_extract_edits_2(git_repo_dir):
    """Block-level extraction of ``first_lines.txt`` must yield a single edit.

    The extraction settings enable ``use_blocks``; the only resulting row is
    expected to carry the placeholder origin
    ``'not available with use_blocks'``.
    """
    target_commit = 'b17c2c321ce8d299de3d063ca0a1b0b363477505'
    target_file = 'first_lines.txt'

    settings = dict(use_blocks=True,
                    blame_C='CCC4',
                    extract_complexity=True,
                    extract_text=True)

    repository = pydriller.GitRepository(git_repo_dir)
    commit = repository.get_commit(target_commit)

    edits = None
    for modification in commit.modifications:
        if modification.filename != target_file:
            continue
        # The result of the last matching modification is the one checked.
        edits = git2net.extraction._extract_edits(repository, commit,
                                                  modification, settings)

    assert len(edits) == 1
    origin = edits.at[0, 'original_commit_addition']
    assert origin == 'not available with use_blocks'
示例#4
0
def test_identify_edits(repo_string):
    """Check the edit types identified for ``text_file.txt`` in a known commit.

    The file's diff is parsed into deleted/added line mappings, which are
    passed to ``_identify_edits`` with ``use_blocks=False``. The ``type``
    values of the returned edits must match the expected sequence.
    """
    target_commit = 'f343ed53ee64717f85135c4b8d3f6bd018be80ad'
    target_file = 'text_file.txt'

    repository = pydriller.GitRepository(repo_string)
    commit = repository.get_commit(target_commit)

    # Keep the last modification whose file name matches.
    for candidate in commit.modifications:
        if candidate.filename == target_file:
            mod = candidate

    diff_lines = repository.parse_diff(mod.diff)

    # Map line number -> line content for each side of the diff.
    deleted_lines = {number: text for number, text in diff_lines['deleted']}
    added_lines = {number: text for number, text in diff_lines['added']}

    _, edits = git2net.extraction._identify_edits(
        deleted_lines, added_lines, use_blocks=False)

    expected_types = [
        'deletion',
        'replacement',
        'deletion',
        'replacement',
        'addition',
        'addition',
        'addition',
    ]
    assert list(edits.type) == expected_types
示例#5
0
def test_extract_edits_1(repo_string):
    """Check line-level edit extraction for ``first_lines.txt`` in a known commit.

    ``_extract_edits`` is called with ``use_blocks=False``; the test expects
    three edits and checks the ``original_commit_addition`` value of each one.

    Args:
        repo_string: Repository location handed to ``pydriller.GitRepository``.
    """
    commit_hash = 'b17c2c321ce8d299de3d063ca0a1b0b363477505'
    filename = 'first_lines.txt'

    git_repo = pydriller.GitRepository(repo_string)
    commit = git_repo.get_commit(commit_hash)
    df = None
    for mod in commit.modifications:
        if mod.filename == filename:
            df = git2net.extraction._extract_edits(git_repo,
                                                   commit,
                                                   mod,
                                                   use_blocks=False,
                                                   blame_C='CCC4')

    # Fail with a readable message (instead of a NameError) when the commit
    # does not contain a modification of the expected file.
    assert df is not None, '{} is not modified in commit {}'.format(
        filename, commit_hash)

    assert len(df) == 3
    assert df.at[
        0,
        'original_commit_addition'] == 'e4448e87541d19d139b9d033b2578941a53d1f97'
    assert df.at[
        1,
        'original_commit_addition'] == '6b531fcb57d5b9d98dd983cb65357d82ccca647b'
    assert df.at[
        2,
        'original_commit_addition'] == 'e4448e87541d19d139b9d033b2578941a53d1f97'
示例#6
0
    def compileAuthors(self):
        """Mine every not-yet-mined repo in ``self.repos`` and collect per-author data.

        When ``self.authorsToMine`` is non-empty, only commits by those
        authors are mined; when it is empty or ``None``, all commits are
        considered. For every accepted commit the matching ``gitAuthor``
        receives the commit hash, the repo, the paths of modified files and
        the added lines of functions detected in the added code.

        Side effects: prints progress to stdout; updates ``self.minedRepos``,
        ``self.commits``, ``self.authors`` and ``self.aliases``; finally
        replaces ``self.repos`` with ``self.minedRepos``.

        Any ``Exception`` raised while processing a commit skips that commit;
        any ``Exception`` or ``KeyboardInterrupt`` raised while processing a
        repo skips to the next repo.
        """
        print("mining " + self.name)

        # Only repos that are not already in self.minedRepos are processed.
        reposToMine = []
        for repo in self.repos:
            if repo not in self.minedRepos:
                reposToMine.append(repo)

        for repo in reposToMine:
            try:
                # If the path has no ".git" entry, switch to the first entry
                # listed inside it and treat that as the repo path.
                if not os.path.exists(
                        repo + "/.git"):  #in case the repo is one level down
                    repo = repo + "/" + os.listdir(repo)[0]
                    #print("moved to "+repo)
                # The adjusted path may already have been mined.
                if repo in self.minedRepos:
                    continue

                # Marked as mined before processing, so a failure below does
                # not cause the repo to be retried.
                self.minedRepos.add(repo)
                #if not os.path.exists(repo+".git"):
                #    repo = os.listdir(repo)[0]

                # NOTE(review): `remote` presumably is a hosting-API repository
                # object (it is used for created_at, get_commits and
                # get_commit) -- confirm against get_remote.
                remote = self.get_remote(repo)
                created = remote.created_at

                if self.authorsToMine:
                    # Collect the SHAs of commits by the requested authors
                    # that have not been recorded in self.commits yet.
                    commitsToMine = []
                    for authorName in self.authorsToMine:
                        for commit in remote.get_commits(author=authorName):
                            if commit.sha not in self.commits:
                                commitsToMine.append(commit.sha)

                    if not commitsToMine:
                        print("No important commits here, skipping " + repo)
                        continue
                    # Restrict traversal to exactly those commits.
                    miner = pydriller.repository_mining.RepositoryMining(
                        repo,
                        only_modifications_with_file_types=gitProfileSet.
                        langList,
                        only_no_merge=True,
                        only_commits=commitsToMine,
                        since=created)
                else:
                    # No author filter: traverse all non-merge commits since
                    # the remote's creation date.
                    miner = pydriller.repository_mining.RepositoryMining(
                        repo,
                        only_modifications_with_file_types=gitProfileSet.
                        langList,
                        only_no_merge=True,
                        since=created)
                # Used below only for parse_diff.
                repository = pydriller.GitRepository(repo)

                print("Scanning repo: " + miner._path_to_repo)

                for commit in tqdm(miner.traverse_commits()):
                    try:
                        author = commit.author

                        # Resolve an unseen git author name to the remote
                        # account login and cache the mapping in self.aliases.
                        if author.name not in self.aliases:
                            ghCommit = remote.get_commit(commit.hash)
                            namedUser = ghCommit.author
                            # Commits without a remote author are skipped.
                            if not namedUser:
                                continue
                            if namedUser.login not in self.authors:
                                self.authors[namedUser.login] = gitAuthor(
                                    namedUser)
                            self.aliases[author.name] = namedUser.login

                        # From here on `author` is the gitAuthor record, not
                        # the PyDriller commit author.
                        author = self.authors[self.aliases[author.name]]

                        if self.authorsToMine and author.name not in self.authorsToMine:
                            continue

                        if commit.hash in author.commits or commit.hash in self.commits:
                            continue  #don't reprocess seen hashes

                        self.commits.add(commit.hash)
                        author.commits.add(commit.hash)

                        if repo not in author.repos:
                            author.repos.add(repo)

                        for mod in commit.modifications:
                            # NOTE(review): calls a private PyDriller method;
                            # its result is not used here -- confirm it is
                            # still needed.
                            mod._calculate_metrics()
                            # Skip files without a new path and files whose
                            # extension is not listed in langList.
                            if mod.new_path is None or not mod.new_path.split(
                                    ".")[-1] in gitProfileSet.langList:
                                continue

                            author.files.add(mod.new_path)
                            #parse diff and add lines to list

                            # Text of all added lines, in diff order.
                            newSC = list()
                            leDiff = repository.parse_diff(mod.diff)
                            for num, line in leDiff["added"]:
                                newSC.append(line)

                            # Analyze only the added lines (joined by
                            # newlines), not the whole file.
                            # NOTE(review): the function line numbers from this
                            # analysis are presumably relative to the joined
                            # text, while the numbers in leDiff["added"] come
                            # from the diff -- confirm that comparing them
                            # below is intended.
                            from lizard import analyze_file as liz
                            fileInfo = liz.analyze_source_code(
                                mod.new_path, "\n".join(newSC))

                            #maintain list of dicts containing the source code of specific functions. Same format as for lines
                            # Cursor into leDiff["added"]; it is shared by all
                            # functions of this file and only moves forward.
                            lineIndex = 0

                            for fun in fileInfo.function_list:
                                #Make sure these appear in the "function"
                                # Pattern: ")" followed by optional whitespace
                                # and "{".
                                arg_list_termination = r"\)\s*{"
                                started = False
                                newFun = dict()
                                lineStr = ""
                                try:
                                    # Skip added lines numbered before the
                                    # function's start line.
                                    while (leDiff["added"][lineIndex][0] <
                                           fun.start_line):
                                        lineIndex += 1

                                    # Consume added lines up to and including
                                    # the function's end line.
                                    while (leDiff["added"][lineIndex][0] <
                                           fun.end_line + 1):
                                        last_lineStr = lineStr
                                        lineStr = leDiff["added"][lineIndex][1]

                                        # The pattern is searched in the current
                                        # line followed by the previous line.
                                        # NOTE(review): that order looks
                                        # reversed for a match spanning two
                                        # lines -- confirm.
                                        if not started and re.search(
                                                arg_list_termination, "".join(
                                                    [lineStr, last_lineStr])):
                                            started = True

                                        # Key: (commit hash, file path, line
                                        # number); value: line text.
                                        newFun.update({
                                            (commit.hash, mod.new_path, leDiff["added"][lineIndex][0]):
                                            lineStr
                                        })
                                        lineIndex += 1
                                except IndexError:  #if end of input reached before end of functions. This is probable when non-complete functions are submitted.
                                    pass

                                # Keep the function only if the pattern was
                                # seen, more than one line was collected and
                                # "}" occurs in the last two lines read.
                                # `last_lineStr` is only evaluated when
                                # `started` is true, i.e. after it was bound.
                                if started and len(
                                        newFun
                                ) > 1 and '}' in lineStr + last_lineStr:
                                    author.lines.update(newFun)
                                    author.functions.append(
                                        self.functionToString(newFun))
                    # Any failure skips this commit without reporting it.
                    except Exception as e:
                        continue
            # Any failure skips the rest of this repo.
            except Exception as e:
                print("problem processing " + repo)
                continue
            # An interrupt abandons the current repo and moves on to the next.
            except KeyboardInterrupt:
                print("continuing")
                continue

            # The repo was already added to minedRepos before processing, so
            # this add has no further effect on the set.
            self.minedRepos.add(repo)
            print(str("finished" + str(miner._path_to_repo)))
            print(self)
        self.repos = self.minedRepos
def get_repo(repo_path):
    """Build and return a ``pydriller.GitRepository`` for ``repo_path``."""
    repository = pydriller.GitRepository(repo_path)
    return repository
示例#8
0
                        help='Only include vulnerabilities of given severity level or higher '
                             '(Default: NONE, include all)')
    parser.add_argument('--confidence', type=Level.from_string, choices=list(Level), default=Level.NONE,
                        help='Only include vulnerabilities of given confidence level or higher '
                             '(Default: NONE, include all)')
    parser.add_argument('--no-merge', action='store_true', help='Do not include merge commits')
    args = parser.parse_args()

    try:
        # Clone the repo if URL is provided
        self = pd.RepositoryMining(args.repo)
        if self._isremote(args.repo):
            tmp_folder = tempfile.TemporaryDirectory()
            args.repo = self._clone_remote_repos(tmp_folder.name, args.repo)

        repo = pd.GitRepository(args.repo)

        args.single = repo.get_commit(args.single).hash if args.single else None
        args.since = datetime.strptime(args.since, '%Y/%m/%d') if args.since else None
        args.to = datetime.strptime(args.to, '%Y/%m/%d') if args.to else None
        args.from_commit = repo.get_commit(args.from_commit).hash if args.from_commit else None
        args.to_commit = repo.get_commit(args.to_commit).hash if args.to_commit else None

        repo_mining = pd.RepositoryMining(args.repo,
                                          single=args.single,
                                          since=args.since,
                                          to=args.to,
                                          from_commit=args.from_commit,
                                          to_commit=args.to_commit,
                                          from_tag=args.from_tag,
                                          to_tag=args.to_tag,