Example #1
from pydriller import RepositoryMining, GitRepository
import datetime
import re

#listaCommits = getLinuxCommits()
pathLinux = '../../journal/repositories/linux'
fileLinuxFeatures = open('output/featuresLinux.csv', 'w')

featuresLinux = []
for commit in RepositoryMining(
        pathLinux,
        only_modifications_with_file_types=['kconfig']).traverse_commits():
    for modification in commit.modifications:
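        # change_type.value == 5 corresponds to ModificationType.MODIFY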
        if ('kconfig' in modification.filename.lower()
                and modification.change_type.value == 5):
            currentSourceCode = modification.source_code.replace(
                '\t', '').strip().split('\n')
            for line in currentSourceCode:
                res = re.match(r'^config \S+', line)
                if res is not None and line.split()[1] not in featuresLinux:
                    featuresLinux.append(line.split()[1])
                    fileLinuxFeatures.write('{}\n'.format(line.split()[1]))
                    print(line)

fileLinuxFeatures.close()
Example #2
def extractBuggyCommits(input_filename, local_repos_directory,
                        output_directory):

    projects = ProjectLoader.getReposPlainName(input_filename)

    print(projects)

    for projectName in projects:

        bug_counter = 0
        bugFixes = []

        print("Analyzing", projectName)

        if (projectName == 'pytorch' or projectName == 'react-native'):
            continue

        if (os.path.exists(
                str(output_directory) + "/" + str(projectName) +
                "_bug_fixing_commits") and os.path.isfile(
                    str(output_directory) + "/" + str(projectName) +
                    "_bug_fixing_commits")):
            print(projectName, "already analyzed, skipping...")
            continue

        # 1. iterate over each project
        # 2. find all commits that fixed bugs using syntactic analysis
        # 3. find the commit that caused the bug
        # 4. for the commit that caused the bug, extract: how many files were in the change set,
        #    the number of lines changed (added or removed), the author, how many commits the
        #    file has had, how many contributors have contributed to the file, the contributor
        #    experience (the percentage of lines authored by the file's top contributor), and
        #    the hunks count

        # commits count, contributor count and contributor experience are in process metrics

        startTime = time.time()

        for commit in RepositoryMining(
                local_repos_directory + "/" + str(projectName),
                only_in_branch='master',
                only_no_merge=True,
                since=datetime.datetime(2019, 6, 1, 0, 0,
                                        0)).traverse_commits():
            commit_msg = commit.msg
            containsBug = 'bug' in commit_msg.casefold()
            containsPatch = 'patch' in commit_msg.casefold()
            containsFix = 'fix' in commit_msg.casefold()
            containsBugIdentifier = bool(re.search(r'#+\d', commit_msg))
            if (containsBug and
                (containsFix or containsPatch or containsBugIdentifier)) or (
                    containsFix and containsBugIdentifier):

                bug_counter = bug_counter + 1

                # get the list of modified files in the fix
                listFixedFiles = commit.modifications

                numFilesModifiedForFix = 0

                numLinesAddedForFix = 0
                numLinesRemovedForFix = 0
                totalComplexityFixedFiles = 0

                fileComplexityCount = 0
                averageComplexityFixedFiles = -1

                totalLinesOfCodeAllFiles = 0

                changedMethods = 0

                numFilesMoved = 0

                for file in listFixedFiles:

                    sourceCodeLanguage = LanguageDetector.detect(file.filename)

                    if sourceCodeLanguage is None or file.nloc == 0: continue

                    if (file.nloc):
                        totalLinesOfCodeAllFiles = totalLinesOfCodeAllFiles + file.nloc

                    numFilesModifiedForFix = numFilesModifiedForFix + 1

                    numLinesAddedForFix = numLinesAddedForFix + file.added
                    numLinesRemovedForFix = numLinesRemovedForFix + file.removed
                    if file.complexity:
                        fileComplexityCount = fileComplexityCount + 1
                        totalComplexityFixedFiles = totalComplexityFixedFiles + file.complexity

                    changedMethods = changedMethods + len(file.changed_methods)

                if (numFilesModifiedForFix == 0): continue

                if (fileComplexityCount != 0):
                    averageComplexityFixedFiles = totalComplexityFixedFiles / fileComplexityCount

                bugFixInfo = {
                    "commit_hash": commit.hash,
                    "author": commit.author.name,
                    "total_complexity": totalComplexityFixedFiles,
                    "average_complexity": averageComplexityFixedFiles,
                    "sum_nloc": totalLinesOfCodeAllFiles,
                    "num_files": numFilesModifiedForFix,
                    "lines_added": numLinesAddedForFix,
                    "lines_removed": numLinesRemovedForFix,
                    "commit_date": commit.author_date,
                    "branches": commit.branches,
                    "num_methods_changed": changedMethods
                }

                bugFixes.append(bugFixInfo)

            # persist intermediate results after every commit
            tempMap = {projectName: bugFixes}

            IOUtils.writeBugMap(tempMap, output_directory,
                                "_bug_fixing_commits")

        endTime = time.time()

        print("time", endTime - startTime)
Example #3
from RepositoryMiner import RepositoryMiner
from dataStructures.Repository_summary import RepositorySummary
from pydriller import RepositoryMining

repo_summary = RepositorySummary()
repoMiner = RepositoryMiner(
    RepositoryMining(path_to_repo="~/Projects/focus-android",
                     from_commit=None,
                     to_commit=None), repo_summary)
repoMiner.create_repository_summary(
    "~/Projects/focus-android", "2d4dc678ce1260b90d3499ebefcdcaf19549f983",
    None)
repoMiner.save_table_as_csv("first_try.csv")
Example #4
                                data[r][c]["files"][f] = {}
                            if not "changes" in data[r][c]["files"][f]:
                                data[r][c]["files"][f]["changes"] = []
                            data[r][c]["files"][f]["changes"].append(
                                thischange)
                            changesfromdiff = True
                            changeCommits.append(c)

        if changesfromdiff:
            #if any changes in this diff were useful...we get the sourcecode for those files using pydriller
            print("\n\n" + mode + "    mining " + r + " " + str(progress) +
                  "/" + str(len(data)))

            commitlist = []
            try:
                for commit in RepositoryMining(r).traverse_commits():
                    commitlist.append(commit.hash)

                    #go through all commits in the repository mining and check if they match with one of the commits that are of interest
                    if commit.hash not in changeCommits:
                        continue

                    for m in commit.modifications:
                        #run through all modifications in the single commit in the repository mining
                        if m.old_path is not None and m.source_code_before is not None:
                            if ".py" not in m.old_path:
                                continue

                            #ignore files that are too large
                            if len(m.source_code_before) > 30000:
                                continue
Example #5
def main():
    repo_path = sys.argv[1]
    repo_branch = 'master'

    commits = RepositoryMining(repo_path,
                               only_in_branch=repo_branch).traverse_commits()
    commits = list(commits)

    gitRepo = GitRepository(repo_path)

    start_date = commits[0].committer_date + relativedelta(years=3)
    last_date = commits[-1].committer_date - relativedelta(years=3)

    bug_tracker = defaultdict(list)
    bug_tracker_pickle = "data3/{}.pickle".format(
        os.path.basename(os.path.normpath(repo_path)))

    # First index the buggy files
    if os.path.exists(bug_tracker_pickle):
        with open(bug_tracker_pickle, 'rb') as handle:
            bug_tracker = pickle.load(handle)
    else:
        for commit_index, commit in enumerate(commits):
            if not is_bugfix_commit(commit.msg):
                continue

            try:
                for m in commit.modifications:
                    if not valid_source_file(m.filename):
                        continue

                    bug_commit = gitRepo.get_commits_last_modified_lines(
                        commit, m)  ### uses SZZ
                    # if bug_commit == {}: continue

                    bug_start_index = 99999999999999999999
                    for _file in bug_commit:
                        for i, _commit in enumerate(commits[:commit_index]):
                            if _commit.hash in bug_commit[_file] \
                                and i<bug_start_index:
                                bug_start_index = i

                    for _commit in commits[bug_start_index:commit_index]:
                        bug_tracker[_commit.hash].append(m.filename)
            except Exception as e:
                print("[***]", e)
                print(traceback.format_exc())
                print("Continuing for next commits")

            print(len(bug_tracker.keys()))
        with open(bug_tracker_pickle, 'wb') as handle:
            pickle.dump(bug_tracker, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Copy the files
    with open('maj_versions/{}.hash'.format(
            os.path.basename(os.path.normpath(repo_path)))) as f:
        major_releases = []
        for line in f.read().splitlines():
            tag, hash = line.split(',')
            major_releases.append((tag, hash))

    for version, commit in enumerate(commits):
        if commit.hash not in [item[1] for item in major_releases]:
            continue

        if commit.committer_date < start_date or commit.committer_date > last_date:
            continue

        for tag, hash in major_releases:
            if hash == commit.hash:
                break

        print("[*] Doing {}".format(tag))
        gitRepo.checkout(commit.hash)

        base_dir_not_bug = "data3/{}/{}/not_bug".format(
            os.path.basename(os.path.normpath(repo_path)), tag)
        base_dir_bug = "data3/{}/{}/bug".format(
            os.path.basename(os.path.normpath(repo_path)), tag)
        if not os.path.exists(base_dir_bug):
            os.makedirs(base_dir_bug)
        if not os.path.exists(base_dir_not_bug):
            os.makedirs(base_dir_not_bug)

        all_files = gitRepo.files()

        for _file in all_files:
            if not valid_source_file(_file):
                continue

            filename = os.path.basename(os.path.normpath(_file))
            if commit.hash in bug_tracker and filename in bug_tracker[
                    commit.hash]:
                file_path_to_write = os.path.join(base_dir_bug, filename)
            else:
                file_path_to_write = os.path.join(base_dir_not_bug, filename)

            shutil.copyfile(_file, file_path_to_write)

    print("All Done!")
Example #6
def repo_to(request):
    path, to = request.param
    return list(RepositoryMining(path_to_repo=path, to=to).traverse_commits())
Example #7
def test_no_url():
    with pytest.raises(Exception):
        list(RepositoryMining().traverse_commits())
Example #8
from pydriller import RepositoryMining

for commit in RepositoryMining('../test-repos/test1/').traverse_commits():
    print('hash {} authored by {}'.format(commit.hash, commit.author.name))
Example #9
    def whole_evolution_with_try_except_tracking(self, repository, topic):
        print(f"Analysing repo ... {repository} in {topic}")
        commits_with_code_smells_dict = {}
        total_number_of_commits = 0
        try:
            for commit in RepositoryMining(f"https://github.com/{repository}.git",
                                           only_modifications_with_file_types=['.py']).traverse_commits():
                total_number_of_commits += 1

                for modification in commit.modifications:
                    if ".py" in str(modification.filename):
                        source_code = modification.source_code
                        ## can be _None_ if the file is added
                        if modification.old_path is None:
                            file_path = modification.new_path
                        else:
                            file_path = modification.old_path

                    else:
                        continue

                    try_excepts = ExceptionHandler().find_exception_handler_patterns(source_code, commit)
                    """
                  
                    try:
                        a = ast.parse(source_code)
                    except SyntaxError as e:
                        continue
                    except ValueError as e:
                        continue

                    v = TryVisitor()
                    v.visit(a)
                    """
                    if try_excepts is None:
                        continue

                    if len(try_excepts) == 0:
                        continue

                    # if not "zeeguu/model/user.py" in file:
                    #    continue
                    """
                    If none, the occurrence will be added as commits_with_code_smells_dict[file] = [code_smell]
                    """
                    if commits_with_code_smells_dict.get(file_path) is None:
                        for eh in try_excepts:
                            eh.author = commit.author.name
                            if eh.robustness_exception_handling:
                                eh.robustness_added_or_removed = "added"

                            if eh.any_exception_smell:
                                eh.exception_smell_added_or_removed = "added"

                        commits_with_code_smells_dict[file_path] = [dict(
                            {'date': str(commit.committer_date), 'exception_handlers': try_excepts})]
                        continue

                    if commits_with_code_smells_dict.get(file_path) is not None:
                        handler_changes = False

                        for eh in try_excepts:
                            eh.author = commit.author.name
                        incomings = []
                        current_exception_handlers = commits_with_code_smells_dict.get(file_path)[-1][
                            "exception_handlers"]

                        new_current_list_buffer = []

                        if len(try_excepts) > len(current_exception_handlers):
                            # del new_incoming_list[i]
                            # search for nearest "number"
                            # old_list_lines = [x.lineno for x in current_exception_handlers]
                            # new_list_lines = [x.lineno for x in new_incoming_list]
                            commits_with_code_smells_dict.get(file_path).append(dict(
                                {'date': str(commit.committer_date),
                                 'exception_handlers': try_excepts}))

                            continue

                            """
                            for file_diff_lineno_key, lineno in modification.diff_parsed.get("added"):
                                for i, current_h in enumerate(new_incoming_list):
                                    if current_h.lineno == file_diff_lineno_key:
                                        new_incoming_list_buffer.append(current_h)
                            """
                            """
                            for i, new_change in enumerate(new_incoming_list):

                                for old_change in current_exception_handlers:

                                    closest_number = old_list_lines[
                                        min(range(len(old_list_lines)), key=lambda i: abs(
                                            old_list_lines[i] - new_change.lineno))]

                                    if old_change.lineno == closest_number:
                                        handler_changes, newest_change, = self.process_changes(old_change,
                                                                                               handler_changes,
                                                                                               new_change)
                                        for i in range(len(new_incoming_list)):
                                            if new_incoming_list[i].lineno == newest_change.lineno:
                                                new_incoming_list[i] = newest_change
                                                continue
                            """

                        if len(try_excepts) < len(current_exception_handlers):
                            for file_diff_lineno_key, lineno in modification.diff_parsed.get("deleted"):
                                for current_hnew in current_exception_handlers:
                                    if current_hnew.lineno == file_diff_lineno_key:
                                        new_current_list_buffer.append(current_hnew)

                            commits_with_code_smells_dict.get(file_path).append(dict(
                                {'date': str(commit.committer_date),
                                 'exception_handlers': try_excepts,
                                 'removed': new_current_list_buffer}))
                            continue

                        for (current, incoming) in zip(current_exception_handlers,
                                                       try_excepts):
                            handler_changes, newest_change, = self.process_changes(current, handler_changes, incoming)

                            incomings.append(newest_change)

                        if handler_changes:
                            commits_with_code_smells_dict.get(file_path).append(dict(
                                {'date': str(commit.committer_date),
                                 'exception_handlers': incomings}))

        except Exception as e:
            print(e)

        repo_name = repository.replace("/", "_")
        path_to_results = f'topic_analysis_results/{topic}'
        if not os.path.exists(path_to_results):
            os.makedirs(path_to_results)
        filename = f"{path_to_results}/{repo_name}_result.json"
        finaldict = {'repo': repository, 'total_commits': total_number_of_commits}
        finaldict.update(commits_with_code_smells_dict)
        with open(filename, "w") as result_file:
            result_file.write(json.dumps(finaldict, indent=4, sort_keys=False, default=lambda x: x.__dict__))
Example #10
def test_projectname_multiple_repos_remote():
    repos = ['https://github.com/ishepard/pydriller', 'test-repos/pydriller']
    for commit in RepositoryMining(path_to_repo=repos).traverse_commits():
        assert commit.project_name == 'pydriller'
Example #11
def repo_to(path, to):
    return list(RepositoryMining(path_to_repo=path, to=to).traverse_commits())
Example #12
                        newline='')

        # with csvfile1:
        #     writer = csv.writer(csvfile1, delimiter=',')
        #     writer.writerow((commit, date_time, elemet_file, 0, 0, 0, 0, 0, 0, 0, 0, 0))
        # csvfile1.close()

    # wbk.close

    return row


count = 0

os.chdir(pathDirectory)
for commit in RepositoryMining(pathDirectory,
                               from_commit=start_commit).traverse_commits():
    count += 1
    print("Commit :", count)

    cmd_Checkout = "git checkout " + commit.hash + " -f"
    # print(cmd_Checkout)
    # print('CLEAN')
    # os.system("git reset --hard")
    print('Checkout !!!!!')
    subprocess.check_output(cmd_Checkout, shell=True)
    # os.system(cmd_Checkout)

#     json_file = {
#         'name': "D:/Projects/AnalysteProject/" + ApplicationName + "/Analyse_" + ApplicationName + ".csv",
#         'row': row,
#         'commit': commit.hash,
Example #13
from pydriller import RepositoryMining
import csv
import shutil

filename = "commitData.csv"
csv_writer = csv.writer(open(filename, 'w'))
csv_writer.writerow([
    "projectID", "commitHash", "commitMessage", "author", "authorDate",
    "authorTimezone", "committer", "committerDate", "committerTimezone",
    "branches", "inMainBranch", "merge", "parents"
])
for commit in RepositoryMining('../usr/src').traverse_commits():

    projectName = commit.project_name
    commitHash = commit.hash
    message = commit.msg

    author = commit.author.name
    date = commit.author_date
    timezone = commit.author_timezone

    committer = commit.committer.name
    committerDate = commit.committer_date
    committerTimezone = commit.committer_timezone

    branches = commit.branches
    inMainBranch = commit.in_main_branch

    merge = commit.merge
    parents = commit.parents

    # write one row per commit with the fields declared in the header above
    csv_writer.writerow([
        projectName, commitHash, message, author, date, timezone, committer,
        committerDate, committerTimezone, branches, inMainBranch, merge, parents
    ])
Example #14
from pydriller import RepositoryMining
import json
from tqdm import tqdm

author_dict = dict()
for commit in tqdm(
        RepositoryMining(r'F:\GitHub\elasticsearch').traverse_commits()):
    if commit.author.email not in author_dict:
        author_dict[commit.author.email] = dict()
        author_dict[commit.author.email]['name'] = str(commit.author.name)
        author_dict[commit.author.email]['cmt'] = 0
        author_dict[commit.author.email]['add'] = 0
        author_dict[commit.author.email]['del'] = 0
        author_dict[commit.author.email]['fixes'] = 0
        author_dict[commit.author.email]['files'] = 0
    author_dict[commit.author.email]['cmt'] += 1
    if 'fix' in commit.msg:
        author_dict[commit.author.email]['fixes'] += 1
    for mod in commit.modifications:
        author_dict[commit.author.email]['add'] += mod.added
        author_dict[commit.author.email]['del'] += mod.removed
        author_dict[commit.author.email]['files'] += 1

filename = 'Pydriller.json'
with open(filename, 'w') as file_obj:
    json.dump(author_dict, file_obj)
Example #15
def test_2_identical_local_urls():
    urls = ["test-repos/test1", "test-repos/test1"]
    assert 10 == len(
        list(RepositoryMining(path_to_repo=urls).traverse_commits()))
Example #16
class RepositoryProcessor:
    def __init__(self, repository: str, owner: str):
        self.owner = owner
        self.repository = os.path.split(repository)[-1]
        self.repo = GitRepository(repository)
        self.mining = RepositoryMining(repository)
        self.pairs = []
        random.seed(42)

    def run(self):
        self.get_all_filepairs()
        with open(os.path.join('filepairs', self.repository, 'pairs.txt'),
                  'w') as f:
            f.write('\n'.join(
                map(lambda x: f'{x[0]} {x[1]} {x[2]}', self.pairs)))
            f.write('\n')

    def get_all_filepairs(self, file_filter=java_file_filter):
        commits = list(
            filter(lambda x: not x.merge, self.mining.traverse_commits()))
        for commit in commits:
            for modification in commit.modifications:
                if modification.change_type == ModificationType.MODIFY:
                    if file_filter(modification.filename):
                        self.get_file_pair(commit, modification)

    def get_file_pair(self, commit, modification: Modification):
        parent = commit.parents[0]

        repo = self.repo.project_name
        commit_hash = commit.hash
        filename = modification.filename

        path = os.path.join('filepairs', repo, commit_hash, filename)
        os.makedirs(path, exist_ok=True)

        self.repo.checkout(parent)
        before = os.path.join(self.repository, modification.old_path)
        before_saved = os.path.join(path,
                                    'before_' + commit_hash + '_' + filename)
        copyfile(before, before_saved)

        self.repo.checkout(commit_hash)
        after = os.path.join(self.repository, modification.new_path)
        after_saved = os.path.join(path,
                                   'after__' + commit_hash + '_' + filename)
        copyfile(after, after_saved)

        self.pairs.append(
            (before_saved, after_saved,
             commit_hash + '.' + self.owner + '.' + before.replace('/', '.')))

    def run_random(self, number):
        self.get_random_filepairs(number)
        with open(os.path.join('filepairs', self.repository, 'pairs.txt'),
                  'w') as f:
            f.write('\n'.join(
                map(lambda x: f'{x[0]} {x[1]} {x[2]}', self.pairs)))
            f.write('\n')

    def get_random_filepairs(self, number, file_filter=java_file_filter):
        commits = random.choices(list(
            filter(lambda x: not x.merge, self.mining.traverse_commits())),
                                 k=number)
        for idx, commit in enumerate(commits):
            print(f'Processing commit №{idx}: {commit.hash}')
            for modification in commit.modifications:
                if modification.change_type == ModificationType.MODIFY:
                    if file_filter(modification.filename):
                        self.get_file_pair(commit, modification)
Example #17
def repo(request):
    return list(RepositoryMining(path_to_repo=request.param).traverse_commits())
Example #18
from pydriller import RepositoryMining
from pydriller.domain.commit import ModificationType

repo = '/Users/luca/TUProjects/Salerno/jpacman-framework'
start = 'f3178b8'
stop = '51f041d'

files = {}
for commit in RepositoryMining(repo, from_commit=start,
                               to_commit=stop).traverse_commits():
    for mod in commit.modifications:
        if mod.filename.endswith(
                '.java') and mod.change_type is not ModificationType.DELETE:
            process_metrics = {
                'change': mod.change_type,
                'added': mod.added,
                'removed': mod.removed,
                'loc': mod.nloc,
                'comp': mod.complexity
            }

            path = mod.new_path
            if path not in files:
                files[path] = []
            files[path].append(process_metrics)

output = open('output.csv', 'w')
output.write('file,n-changes,added,removed,loc,complexity\n')

for key, value in files.items():
    n_changes = len(value)
Example #19
def test_clone_repo_to_not_existing():
    with pytest.raises(Exception):
        list(RepositoryMining("https://github.com/ishepard/pydriller",
                              clone_repo_to="NOTEXISTINGDIR").traverse_commits())
Example #20
def store_commit_data(git_directory_path, devranker_dir, output_file_path,
                      str_from_date, str_to_date):
    # Why 'set_start_method("spawn")'?
    # Because getting Multiple windows unnecessarily and window became unresponsive after Mining is done
    # Ref: https://pythonspeed.com/articles/python-multiprocessing/
    mp.set_start_method("spawn")

    # Creating empty lists for carrying commit data
    doclist = []
    # Using list to update progress bar because it's thread-safe
    completed_commits = []

    # Create Multithreading pool to use full CPU
    # Ref: https://pythonspeed.com/articles/python-multiprocessing/
    pool = mp.Pool(mp.cpu_count())

    global total_commits_count
    # If the Repo has just been cloned, the program will traverse the whole Repo
    # https://dzone.com/articles/shared-counter-python%E2%80%99s
    if str_from_date == "All":
        commits = RepositoryMining(git_directory_path).traverse_commits()
        # 'more_itertools' used here to find commits count as 'commits' is Iterable
        # Note: ilen(commits) consumes the iterable 'commits'
        total_commits_count = more_itertools.ilen(commits)

        # NOTE: process_commit(...) is evaluated eagerly here, so the mining work
        # actually runs in the parent process; apply_async only receives its return value.
        [
            pool.apply_async(process_commit(commit, doclist,
                                            completed_commits)) for commit in
            RepositoryMining(git_directory_path).traverse_commits()
        ]
        # Close Multiprocessing pool
        pool.close()
        pool.join()
    else:
        arr_from_date = str_from_date.split("-")
        arr_to_date = str_to_date.split("-")

        dt_from = datetime(int(arr_from_date[0]), int(arr_from_date[1]),
                           int(arr_from_date[2]), 0, 0, 0)
        dt_to = datetime(int(arr_to_date[0]), int(arr_to_date[1]),
                         int(arr_to_date[2]), 0, 0, 0)
        commits = RepositoryMining(git_directory_path, since=dt_from,
                                   to=dt_to).traverse_commits()
        # 'more_itertools' used here to find commits count as 'commits' is Iterable
        # Note: ilen(commits) consumes the iterable 'commits'
        total_commits_count = more_itertools.ilen(commits)
        if total_commits_count == 0:
            dict_callback_start_mining["msg"] = "no_commits"
            print(json.dumps(dict_callback_start_mining))
            return
        [
            pool.apply_async(process_commit(commit, doclist,
                                            completed_commits))
            for commit in RepositoryMining(git_directory_path,
                                           since=dt_from,
                                           to=dt_to).traverse_commits()
        ]
        # Close Multiprocessing pool
        pool.close()
        pool.join()

    # We have data in json format but we need output as csv.
    # There are many approaches to doing this including using dictionaries and stuff.
    # But the easiest way is to write json to file using json.dump and using pandas to read json file.
    # Write data to temp file since pandas.read_json expects file. We can probably optimise without having to
    #     create a new file.
    temp_file = os.path.join(devranker_dir, 'mod_data.json')
    with open(temp_file, 'w') as temp_out_file:
        # json.dump cannot handle python datetime object. We should convert this object to 'str'
        # https://stackoverflow.com/questions/11875770/how-to-overcome-datetime-datetime-not-json-serializable
        # https://code-maven.com/serialize-datetime-object-as-json-in-python
        json.dump(doclist, temp_out_file, default=str)

    # Use pandas to read json and write to csv.
    df = pandas.read_json(temp_file)
    df.to_csv(output_file_path)

    # Remove the temp file
    os.remove(temp_file)
    # display_data_file_location_path()
    # Inform user that mining is complete

    dict_callback_start_mining["msg"] = "Done"
    dict_callback_start_mining["tc"] = 0
    dict_callback_start_mining["cc"] = 0
    print(json.dumps(dict_callback_start_mining))
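
The temp-json plus pandas detour at the end of the previous block can also be avoided. A minimal sketch (assuming every dict in doclist has the same keys) writes the rows directly with csv.DictWriter:

import csv

def write_docs_to_csv(doclist, output_file_path):
    # assumes all dicts share the same keys; datetimes are serialized via str()
    if not doclist:
        return
    with open(output_file_path, 'w', newline='') as out_file:
        writer = csv.DictWriter(out_file, fieldnames=list(doclist[0].keys()))
        writer.writeheader()
        for doc in doclist:
            writer.writerow({key: str(value) for key, value in doc.items()})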
Example #21
def test_badly_formatted_repo_url():
    with pytest.raises(Exception):
        list(RepositoryMining(path_to_repo=set('repo')).traverse_commits())
Example #22
def get_commit_count(project_path):
    result = 0
    for commit in RepositoryMining(project_path).traverse_commits():
        result += 1
    return result
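
If only the total number of commits is needed, PyDriller's GitRepository exposes it directly; a shorter equivalent sketch (assuming project_path points to a local clone):

from pydriller import GitRepository

def get_commit_count(project_path):
    # total_commits() returns the number of commits in the repository
    return GitRepository(project_path).total_commits()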
Example #23
#!/usr/bin/env python3
#Takes in a commit hash and returns the code associated with that hash
#Hash in question is a hash marked with a cyclomatic complexity of 2041 and I was curious to see what that looks like lol
from pydriller import RepositoryMining

project_url = 'https://github.com/NationalSecurityAgency/ghidra.git'

hash = "2df81f803b99e0900c298f0213dfb7d0911052b1"
count = 0
avgLinesOfCode = 0
CommitList = []

with open("codeSegment.txt", 'w') as myfile:
    for commit in RepositoryMining(project_url).traverse_commits():
        for m in commit.modifications:
            if (commit.hash == hash):
                myfile.write(m.source_code)

myfile.close()
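
Since only one known hash is of interest, the whole history does not have to be traversed. A sketch of the same idea using RepositoryMining's single filter (same project_url and hash as above):

from pydriller import RepositoryMining

project_url = 'https://github.com/NationalSecurityAgency/ghidra.git'
hash = "2df81f803b99e0900c298f0213dfb7d0911052b1"

with open("codeSegment.txt", 'w') as myfile:
    # 'single' restricts mining to the one commit of interest
    for commit in RepositoryMining(project_url, single=hash).traverse_commits():
        for m in commit.modifications:
            if m.source_code is not None:
                myfile.write(m.source_code)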
Example #24
import sys
from pydriller import RepositoryMining
import csv
from functions import remove_duplicate_commits, dictionary_of_spoon_output
import subprocess

countOfArgs = len(sys.argv)
pathToRepo = None
if countOfArgs == 2:
    pathToRepo = sys.argv[1]
else:
    pathToRepo = '../repository/'
with open('output/pathToRepo.csv', 'w') as myFile:
    myFile.write(pathToRepo)
changes = []
for commit in RepositoryMining(pathToRepo, only_modifications_with_file_types=['.java']).traverse_commits():
        for modification in commit.modifications:
            if modification.change_type is not None:
                extOfFile = modification.filename[modification.filename.find('.') + 1:]
                if extOfFile == 'java' and modification.change_type.name in ('MODIFY', 'RENAME'):
                    changes.append([commit.parents[0], modification.old_path, commit.hash, modification.new_path])
with open('output/changes.csv', 'w') as myFile:
    wr = csv.writer(myFile)
    wr.writerows(changes)
listOfCommitsToIterate = remove_duplicate_commits(changes)
with open('output/inputForSpoon.csv', 'w') as myFile:
    commitsCount = len(listOfCommitsToIterate)
    position = 1
    for key, value in listOfCommitsToIterate.items():
        if position != commitsCount:
Example #25
from pydriller import RepositoryMining
import pandas as pd

data = pd.read_excel('../Data/types_algos_occurences.xlsx')
writer = pd.ExcelWriter('../Data/types.xlsx')
df = pd.DataFrame(data)
i = 0
while i < len(df):
    cmp = 0
    string = df.at[i, 'name'].lower()
    for commit in RepositoryMining('../../scikit-learn').traverse_commits():
        if (string in commit.msg.lower()):
            cmp += 1
    df.at[i, 'nb'] = cmp
    i += 1
df.to_excel(writer)
writer.save()
writer.close()
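
The loop above re-mines the entire scikit-learn history once per row of the spreadsheet. A single-pass sketch (same inputs, assuming the 'name' column holds the search strings) traverses the repository once and counts every string in the same pass:

from pydriller import RepositoryMining
import pandas as pd

data = pd.read_excel('../Data/types_algos_occurences.xlsx')
df = pd.DataFrame(data)

# lower-cased search strings, one per spreadsheet row
names = [str(name).lower() for name in df['name']]
counts = {name: 0 for name in names}

# traverse the repository once and update every counter per commit message
for commit in RepositoryMining('../../scikit-learn').traverse_commits():
    msg = commit.msg.lower()
    for name in names:
        if name in msg:
            counts[name] += 1

df['nb'] = [counts[name] for name in names]
df.to_excel('../Data/types.xlsx')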
Example #26
def miner():
    repo_path = os.path.abspath(working_path + repo_name)
    # Clone if necessary
    if not os.path.exists(repo_path):
        print("Cloning: {}".format(repo_name))
        for c in RepositoryMining(repo_git,
                                  clone_repo_to=os.path.abspath(
                                      working_path)).traverse_commits():
            pass
    else:
        print("{} clone done!".format(repo_name))

    # Extract FIX and BIC
    bic_csv = os.path.abspath(working_path + repo_name + "_all.csv")
    header = [
        "hash", "path", "size", "developer", "type", "fix", "bic_path",
        "bic_hash", "bic_size"
    ]
    if not os.path.exists(bic_csv):
        print("Extracting FIX and BIC")
        out_file = open(bic_csv, 'w', newline='', encoding="utf-8")
        writer = csv.DictWriter(out_file, delimiter=',', fieldnames=header)
        writer.writeheader()
        to_date = datetime(2017, 12, 1, 12, 0, 0)
        gr = GitRepository(repo_path)
        gr2 = GitRepository(repo_path)
        for commit in RepositoryMining(
                repo_path,
                to=to_date,
                only_no_merge=True,
                only_modifications_with_file_types=extensions,
                reversed_order=True).traverse_commits():
            msg = commit.msg.lower()
            mods = commit.modifications
            if len(mods) < 50 and any(word in msg for word in keywords):
                dout = {
                    "hash": commit.hash,
                    "size": len(mods),
                    "developer": commit.committer.email,
                    "fix": True
                }
                for mod in mods:
                    dout["type"] = mod.change_type
                    if mod.change_type == ModificationType.DELETE:
                        dout["path"] = mod.old_path
                    else:
                        dout["path"] = mod.new_path
                    bics_per_mod = gr.get_commits_last_modified_lines(
                        commit, mod)
                    for bic_path, bic_commit_hashs in bics_per_mod.items():
                        dout["bic_path"] = bic_path
                        for bic_commit_hash in bic_commit_hashs:
                            bic = gr2.get_commit(bic_commit_hash)
                            dout["bic_hash"] = bic_commit_hash
                            dout["bic_size"] = len(bic.modifications)
                            writer.writerow(dout)
                            out_file.flush()
            else:
                dout = {
                    "hash": commit.hash,
                    "size": len(mods),
                    "developer": commit.committer.email,
                    "fix": False,
                    "bic_path": "---",
                    "bic_hash": "---",
                    "bic_size": "---"
                }
                for mod in mods:
                    dout["path"] = mod.new_path
                    writer.writerow(dout)
                    out_file.flush()
        out_file.close()
    else:
        print("Extracting FIX and BIC done!")

    # Get unique BIC
    in_file = open(bic_csv, 'r', newline='', encoding="utf-8")
    reader = csv.DictReader(in_file, delimiter=',')
    unique_devs = set()
    unique_commits = set()
    fixes = {}
    unique_bics = set()
    unique_fics = set()
    for row in reader:
        unique_commits.add(row["hash"])
        if row["path"].endswith(tuple(extensions)):
            unique_devs.add(row["developer"])
            unique_bics.add(row["bic_hash"])
            unique_fics.add(row["bic_path"])
            if row["fix"] == "True":
                fixes[row["hash"]] = True
    unique_bics.remove("---")
    unique_fics.remove("---")
    in_file.close()
    print("Developers: {}, Commits: {} Defective: {}".format(
        len(unique_devs), len(unique_commits), len(fixes)))

    # Save list of BIC
    unique_bic_txt = os.path.abspath(working_path + repo_name +
                                     "_unique_bic.txt")
    out_file = open(unique_bic_txt, 'w', newline='', encoding="utf-8")
    for bic in unique_bics:
        out_file.write(bic)
        out_file.write("\n")
    out_file.close()

    # Save list of FIX
    unique_fix_txt = os.path.abspath(working_path + repo_name +
                                     "_unique_fix.txt")
    out_file = open(unique_fix_txt, 'w', newline='', encoding="utf-8")
    for fix in fixes:
        out_file.write(fix)
        out_file.write("\n")
    out_file.close()

    # Count fully and partially defective commits, and defective files in defective commits
    bic_csv = os.path.abspath(working_path + repo_name + "_bic_metrics.csv")
    header = ["bic_hash", "bic_size", "bic_path", "defective"]
    if not os.path.exists(bic_csv):
        print("Counting partial BIC")
        out_file = open(bic_csv, 'w', newline='', encoding="utf-8")
        writer = csv.DictWriter(out_file, delimiter=',', fieldnames=header)
        writer.writeheader()
        gr = GitRepository(repo_path)
        for bic_hash in unique_bics:
            commit = gr.get_commit(bic_hash)
            diff = count_file = len(commit.modifications)
            dout = {
                "bic_hash": bic_hash,
                "bic_size": len(commit.modifications)
            }
            for mod in commit.modifications:
                if mod.filename.endswith(
                        tuple(extensions)
                ) and mod.change_type is not ModificationType.DELETE:
                    dout["bic_path"] = mod.new_path
                    if mod.new_path in unique_fics:
                        diff -= 1
                        dout["defective"] = True
                    else:
                        dout["defective"] = False
                    writer.writerow(dout)
                    out_file.flush()
                else:
                    count_file -= 1
                    diff -= 1
        out_file.close()
    else:
        print("Counting partial BIC done!")

    # Calculate partially defective commits
    in_file = open(bic_csv, 'r', newline='', encoding="utf-8")
    reader = csv.DictReader(in_file, delimiter=',')
    bics = {}
    fully_defective = partially_defective = 0
    partially_defective_files = total_defective_files = 0
    for row in reader:
        if row["bic_path"].endswith(tuple(extensions)):
            if row["bic_hash"] in bics:
                bics[row["bic_hash"]].append(row["defective"])
            else:
                bics[row["bic_hash"]] = [row["defective"]]
    for key, value in bics.items():
        count_defective_files = value.count("True")
        if len(value) > 1:
            total_defective_files += count_defective_files
        if len(value) == count_defective_files:
            fully_defective += 1
        else:
            partially_defective += 1
            partially_defective_files += len(value) - count_defective_files
    ratio_defective_files_in_defective_commits = round(
        (partially_defective_files / total_defective_files) * 100, 1)
    ratio_partially_defective_commits = round(
        (partially_defective / (fully_defective + partially_defective)) * 100,
        1)
    print(
        "Partially def. commits: {}%. Defective files in partially def. commits: {}%"
        .format(ratio_partially_defective_commits,
                ratio_defective_files_in_defective_commits))
Example #27
def findBugCausingCommits(projectMap, local_repos_directory, output_directory):

    bugInducingProjectMap = {}

    for project, commits in projectMap.items():

        print("finding bug causing commits for ",
              str(local_repos_directory) + "/" + project)

        if (os.path.exists(
                str(output_directory) + "/" + str(project) +
                "_bug_causing_commits") and os.path.isfile(
                    str(output_directory) + "/" + str(project) +
                    "_bug_causing_commits")):
            print(project, "already analyzed, skipping...")
            continue

        repo_path = str(local_repos_directory) + "/" + project

        repo = GitRepository(repo_path)

        startTime = time.time()

        bugInducingCommits = []

        hashes = [x["commit_hash"] for x in commits]

        try:

            # analyze each bug fix for this project
            for bugFix in RepositoryMining(
                    repo_path, only_commits=hashes).traverse_commits():

                # get the commits that last touched the modified lines of the files
                commitsLastTouchedFix = repo.get_commits_last_modified_lines(
                    bugFix)

                bugCausingHashes = set([])

                for filename, fileCommit in commitsLastTouchedFix.items():

                    for fileHash in fileCommit:
                        bugCausingHashes.add(fileHash)

                hashList = list(bugCausingHashes)

                # get average statistics about each of these commits
                # number of files modified for the commit
                # number of lines added for the commit
                # number of lines removed for the commit
                # number of methods changed for the commit
                # author of the commit
                # the elapsed time for the bug fix
                # branches
                for bugCausingCommit in RepositoryMining(
                        repo_path, only_commits=hashList).traverse_commits():

                    numModifiedFiles = len(bugCausingCommit.modifications)
                    linesAdded = 0
                    linesRemoved = 0
                    numMethodsChanged = 0
                    sum_nloc = 0
                    numFilesWithComplexity = 0
                    sumComplexity = 0

                    if numModifiedFiles <= 0: continue

                    for modification in bugCausingCommit.modifications:
                        sourceCodeLanguage = LanguageDetector.detect(
                            modification.filename)
                        try:
                            if (sourceCodeLanguage is None
                                    or modification.nloc == 0
                                    or modification.nloc is None):
                                continue
                        except:
                            pass
                        sum_nloc = sum_nloc + modification.nloc
                        linesAdded = linesAdded + modification.added
                        linesRemoved = linesRemoved + modification.removed
                        numMethodsChanged = numMethodsChanged + len(
                            modification.changed_methods)
                        if modification.complexity:
                            numFilesWithComplexity = numFilesWithComplexity + 1
                            sumComplexity = sumComplexity + modification.complexity

                    averageComplexityFixedFiles = 0

                    if (numFilesWithComplexity != 0):
                        averageComplexityFixedFiles = sumComplexity / numFilesWithComplexity

                    bugInducingInfo = {
                        "commit_hash": bugCausingCommit.hash,
                        "author": bugCausingCommit.author.name,
                        "total_complexity": sumComplexity,
                        "average_complexity": averageComplexityFixedFiles,
                        "sum_nloc": sum_nloc,
                        "num_files": numModifiedFiles,
                        "lines_added": linesAdded,
                        "lines_removed": linesRemoved,
                        "commit_date": bugCausingCommit.author_date,
                        "branches": bugCausingCommit.branches,
                        "num_methods_changed": numMethodsChanged,
                        "time_to_fix": bugFix.author_date - bugCausingCommit.author_date
                    }

                    # print(bugInducingInfo["commit_hash"])
                    # print(bugInducingInfo["author"])
                    # print(bugInducingInfo["total_complexity"])
                    # print(bugInducingInfo["average_complexity"])
                    # print(bugInducingInfo["sum_nloc"])
                    # print(bugInducingInfo["num_files"])
                    # print(bugInducingInfo["lines_added"])
                    # print(bugInducingInfo["lines_removed"])
                    # print(bugInducingInfo["commit_date"])
                    # print(bugInducingInfo["branches"])
                    # print(bugInducingInfo["num_methods_changed"])
                    # print(bugInducingInfo["time_to_fix"])

                    bugInducingCommits.append(bugInducingInfo)

            tempMap = {project: bugInducingCommits}

            IOUtils.writeBugMap(tempMap, output_directory,
                                "_bug_causing_commits")

            endTime = time.time()

            print("time", endTime - startTime)

        except Exception as e:
            print("FAILED FOR", project, e)
Example #28
def test_simple_url():
    assert 5 == len(
        list(
            RepositoryMining(
                path_to_repo="test-repos/test1").traverse_commits()))
Example #29
from pydriller import RepositoryMining

for commit in RepositoryMining('https://github.com/java-native-access/jna').traverse_commits():
    print('hash {} authored by {}'.format(commit.hash, commit.author.name))
Example #30
        for x in node.body:
            if isinstance(x, ast.FunctionDef):
                Contador += 1
        super(MyCustomVisitor, self).generic_visit(node)


class RodarAnalise():
    def __init__(self):
        pass


if __name__ == "__main__":

    numeroDoComit = 0
    for lista in RepositoryMining(
            'https://github.com/WilliamCDL/Testeinicial',
            only_modifications_with_file_types=['.py']).traverse_commits():

        for arquivos in lista.modifications:

            # source_code can be None (e.g. for deleted files), so guard before parsing
            if arquivos.filename.endswith('.py') and arquivos.source_code is not None:
                try:
                    root = ast.parse(arquivos.source_code)
                except IndentationError as e2:
                    # IndentationError is a subclass of SyntaxError, so it must be caught first
                    logging.error(e2.print_file_and_line)
                    continue
                except (SyntaxError, ValueError) as e1:
                    #logging.error('{}\n\t{}'.format(e1.msg, e1.text))
                    continue
                visitor = MyCustomVisitor()
                visitor.visit(root)