Example #1
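All examples below assume roughly the following imports, reconstructed from usage. authorIdExtractor, prepareGraph, outputStatistics, areSimilar, and Configuration are project-internal helpers defined elsewhere in the source; sketches of the undefined ones follow the relevant examples.

import csv
import os
from collections import Counter
from datetime import datetime
from typing import List

import git
import matplotlib.pyplot as plt
import networkx as nx
import pytz
import requests
import yaml
from dateutil.relativedelta import relativedelta
from git import Commit
from networkx.algorithms.community import greedy_modularity_communities
from progress.bar import Bar
from sentistrength import PySentiStr
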
def processBatch(batchIdx: int, commits: List[Commit], config: Configuration):
    allRelatedAuthors = {}
    authorCommits = Counter()

    # for all commits...
    print("Analyzing centrality")
    for commit in Bar("Processing").iter(commits):
        author = authorIdExtractor(commit.author)

        # increase author commit count
        authorCommits.update({author: 1})

        # initialize dates for related author analysis
        commitDate = datetime.fromtimestamp(commit.committed_date)
        earliestDate = commitDate + relativedelta(months=-1)
        latestDate = commitDate + relativedelta(months=+1)

        commitRelatedCommits = filter(
            lambda c: findRelatedCommits(author, earliestDate, latestDate, c), commits
        )

        commitRelatedAuthors = {
            authorIdExtractor(c.author) for c in commitRelatedCommits
        }

        # get current related authors collection and update it
        authorRelatedAuthors = allRelatedAuthors.setdefault(author, set())
        authorRelatedAuthors.update(commitRelatedAuthors)

    return prepareGraph(
        allRelatedAuthors, authorCommits, batchIdx, "commitCentrality", config
    )
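
prepareGraph is not among the listed examples, but Example #5 performs the equivalent graph construction inline. A minimal sketch consistent with that code; everything beyond the call site's argument list is an assumption, and the real function presumably also runs the centrality analysis and writes batch-suffixed CSVs:

def prepareGraph(allRelatedAuthors, authorCommits, batchIdx, outputPrefix, config):
    # hypothetical sketch: build the author collaboration graph the same way
    # Example #5 does inline
    graph = nx.Graph()
    for author, relatedAuthors in allRelatedAuthors.items():
        graph.add_node(author.strip())
        for relatedAuthor in relatedAuthors:
            graph.add_edge(author.strip(), relatedAuthor.strip())
    return graph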
Example #2
def findRelatedCommits(author, earliestDate, latestDate, commit):
    isDifferentAuthor = author != authorIdExtractor(commit.author)
    if not isDifferentAuthor:
        return False

    commitDate = datetime.fromtimestamp(commit.committed_date)
    return earliestDate <= commitDate <= latestDate
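
Every example relies on authorIdExtractor, which is not listed. A minimal sketch, assuming it does nothing more than normalize the commit Actor's email into a comparable identity key:

def authorIdExtractor(author: git.Actor) -> str:
    # hypothetical helper: fall back to the name when the email is missing,
    # and normalize case and whitespace so the same person compares equal
    return (author.email or author.name or "").lower().strip()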
Example #3
def replaceAll(commits, aliases):
    # note: this rewrites each Actor's email in memory only; the underlying
    # repository is not modified
    for commit in Bar("Processing").iter(list(commits)):
        author = authorIdExtractor(commit.author)

        if author in aliases:
            commit.author.email = aliases[author]

        yield commit
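
replaceAll is a generator, so a caller materializes it before analysis. Note that extractAliases (Example #4) writes the mapping as {canonical: [emails]}, while replaceAll looks up by author email, so the mapping must be inverted first. A plausible usage sketch; the paths and variable names are assumptions:

repo = git.Repo("path/to/repo")

with open("aliases.yml") as f:
    aliasesByKey = yaml.safe_load(f)  # {canonical: [emails]}, as written by extractAliases

# invert to the {email: canonical} shape replaceAll expects
emailToAlias = {
    email: key for key, emails in aliasesByKey.items() for email in emails
}

commits = list(replaceAll(repo.iter_commits(), emailToAlias))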
Example #4
def extractAliases(repo: git.Repo, aliasPath: str, repoShortname: str,
                   token: str, maxDistance: float):
    commits = list(repo.iter_commits())

    # map each distinct author email to one commit sha (single pass instead of
    # re-scanning the full history once per email)
    shasByEmail = {}
    for commit in Bar("Processing").iter(commits):
        email = authorIdExtractor(commit.author)
        shasByEmail.setdefault(email, commit.hexsha)

    # query github for author logins by their commits
    loginsByEmail = dict()
    emailsWithoutLogins = []

    for email in Bar("Processing").iter(shasByEmail):
        sha = shasByEmail[email]
        url = "https://api.github.com/repos/{}/commits/{}".format(
            repoShortname, sha)
        response = requests.get(url,
                                headers={"Authorization": "token " + token})
        commit = response.json()

        if not "author" in commit.keys():
            continue

        if not commit["author"] is None and not commit["author"][
                "login"] is None:
            loginsByEmail[email] = commit["author"]["login"]
        else:
            emailsWithoutLogins.append(email)

    # build initial alias collection from logins
    aliases = {}
    usedAsValues = {}

    for email in loginsByEmail:
        login = loginsByEmail[email]
        aliasEmails = aliases.setdefault(login, [])
        aliasEmails.append(email)
        usedAsValues[email] = login

    if emailsWithoutLogins:
        for authorA in Bar("Processing").iter(emailsWithoutLogins):
            quickMatched = False

            # go through used values
            for key in usedAsValues:
                if authorA == key:
                    quickMatched = True
                    continue

                if areSimilar(authorA, key, maxDistance):
                    alias = usedAsValues[key]
                    aliases[alias].append(authorA)
                    usedAsValues[authorA] = alias
                    quickMatched = True
                    break

            if quickMatched:
                continue

            # go through already extracted keys
            for key in aliases:
                if authorA == key:
                    quickMatched = True
                    continue

                if areSimilar(authorA, key, maxDistance):
                    aliases[key].append(authorA)
                    usedAsValues[authorA] = key
                    quickMatched = True
                    break

            if quickMatched:
                continue

            # go through all authors
            for authorB in emailsWithoutLogins:
                if authorA == authorB:
                    continue

                if areSimilar(authorA, authorB, maxDistance):
                    aliasedAuthor = aliases.setdefault(authorA, [])
                    aliasedAuthor.append(authorB)
                    usedAsValues[authorB] = authorA
                    break

    print("Writing aliases to '{0}'".format(aliasPath))
    os.makedirs(os.path.dirname(aliasPath), exist_ok=True)

    # overwrite rather than append: appending a second YAML document on a
    # repeated run would corrupt the file
    with open(aliasPath, "w", newline="") as f:
        yaml.dump(aliases, f)
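
areSimilar is called with a maxDistance threshold but is not listed. A plausible sketch using only the standard library's difflib; the real implementation may use a different string metric such as Levenshtein distance:

from difflib import SequenceMatcher

def areSimilar(valueA: str, valueB: str, maxDistance: float) -> bool:
    # hypothetical: treat 1 minus the similarity ratio as a distance in [0, 1]
    distance = 1 - SequenceMatcher(None, valueA.lower(), valueB.lower()).ratio()
    return distance <= maxDistance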
Example #5
def centralityAnalysis(repo: git.Repo, commits: List[git.Commit],
                       outputDir: str):

    allRelatedAuthors = {}
    authorCommits = Counter()

    # for all commits...
    print("Analyzing centrality")
    for commit in Bar("Processing").iter(commits):
        author = authorIdExtractor(commit.author)

        # increase author commit count
        authorCommits.update({author: 1})

        # initialize dates for related author analysis
        commitDate = datetime.fromtimestamp(commit.committed_date)
        earliestDate = commitDate + relativedelta(months=-1)
        latestDate = commitDate + relativedelta(months=+1)

        # find commits related to this one; scanning the in-memory list avoids
        # re-reading the repository (commit.iter_items with before/after date
        # filters would be an alternative)
        commitRelatedCommits = filter(
            lambda c: findRelatedCommits(author, earliestDate, latestDate, c),
            commits)

        commitRelatedAuthors = {
            authorIdExtractor(c.author) for c in commitRelatedCommits
        }

        # get current related authors collection and update it
        authorRelatedAuthors = allRelatedAuthors.setdefault(author, set())
        authorRelatedAuthors.update(commitRelatedAuthors)

    # prepare graph
    print("Preparing NX graph")
    G = nx.Graph()

    for author in allRelatedAuthors:
        # strip consistently so node names match the edge endpoints below
        G.add_node(author.strip())

        for relatedAuthor in allRelatedAuthors[author]:
            G.add_edge(author.strip(), relatedAuthor.strip())

    # analyze graph
    closeness = nx.closeness_centrality(G)
    betweenness = nx.betweenness_centrality(G)
    centrality = nx.degree_centrality(G)
    density = nx.density(G)
    modularity = []

    try:
        for idx, community in enumerate(greedy_modularity_communities(G)):
            authorCount = len(community)
            communityCommitCount = sum(authorCommits[author]
                                       for author in community)
            row = [authorCount, communityCommitCount]
            modularity.append(row)
    except ZeroDivisionError:
        # community detection can divide by zero on an edgeless graph;
        # leave modularity empty in that case
        pass

    # find high-centrality authors (degree centrality above 0.5)
    numberHighCentralityAuthors = len(
        [author for author, value in centrality.items() if value > 0.5])

    percentageHighCentralityAuthors = numberHighCentralityAuthors / len(
        allRelatedAuthors)

    print("Outputting CSVs")

    # output non-tabular results
    with open(os.path.join(outputDir, "project.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Density", density])
        w.writerow(["Community Count", len(modularity)])

    # output community information
    with open(os.path.join(outputDir, "community.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Community Index", "Author Count", "Commit Count"])
        for idx, community in enumerate(modularity):
            w.writerow([idx + 1, community[0], community[1]])

    # combine centrality results
    combined = {}
    for key in closeness:
        single = {
            "Author": key,
            "Closeness": closeness[key],
            "Betweenness": betweenness[key],
            "Centrality": centrality[key],
        }

        combined[key] = single

    # output tabular results
    with open(os.path.join(outputDir, "centrality.csv"), "w", newline="") as f:
        w = csv.DictWriter(
            f, ["Author", "Closeness", "Betweenness", "Centrality"])
        w.writeheader()

        for key in combined:
            w.writerow(combined[key])

    # output high centrality authors
    with open(os.path.join(outputDir, "project.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(
            ["NumberHighCentralityAuthors", numberHighCentralityAuthors])
        w.writerow([
            "PercentageHighCentralityAuthors", percentageHighCentralityAuthors
        ])

    # output statistics
    outputStatistics(
        list(closeness.values()),
        "Closeness",
        outputDir,
    )

    outputStatistics(
        list(betweenness.values()),
        "Betweenness",
        outputDir,
    )

    outputStatistics(
        list(centrality.values()),
        "Centrality",
        outputDir,
    )

    outputStatistics(
        [community[0] for community in modularity],
        "CommunityAuthorCount",
        outputDir,
    )

    outputStatistics(
        [community[1] for community in modularity],
        "CommunityCommitCount",
        outputDir,
    )

    # output graph to PNG
    print("Outputting graph to PNG")
    graphFigure = plt.figure(5, figsize=(30, 30))
    nx.draw(
        G,
        with_labels=True,
        node_color="orange",
        node_size=4000,
        edge_color="black",
        linewidths=2,
        font_size=20,
    )
    graphFigure.savefig(os.path.join(outputDir, "graph.png"))
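
A sketch of how centralityAnalysis might be driven end to end; the repository path and output directory are placeholders:

repo = git.Repo("path/to/repo")
commits = list(repo.iter_commits())

os.makedirs("out", exist_ok=True)
centralityAnalysis(repo, commits, "out")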
Example #6
def commitAnalysis(commits: List[git.Commit], outputDir: str):

    authorInfoDict = {}
    timezoneInfoDict = {}
    experienceDays = 150

    # traverse all commits
    print("Analyzing commits")
    for commit in Bar("Processing").iter(commits):

        # extract info
        author = authorIdExtractor(commit.author)
        timezone = commit.author_tz_offset
        time = commit.authored_datetime

        # get timezone
        timezoneInfo = timezoneInfoDict.setdefault(
            timezone, dict(commitCount=0, authors=set())
        )

        # save author
        timezoneInfo["authors"].add(author)

        # increase commit count
        timezoneInfo["commitCount"] += 1

        # get author
        authorInfo = authorInfoDict.setdefault(
            author,
            dict(
                commitCount=0,
                sponsoredCommitCount=0,
                earliestCommitDate=time,
                latestCommitDate=time,
                sponsored=False,
                activeDays=0,
                experienced=False,
            ),
        )

        # increase commit count
        authorInfo["commitCount"] += 1

        # validate earliest commit
        # by default GitPython orders commits from latest to earliest
        if time < authorInfo["earliestCommitDate"]:
            authorInfo["earliestCommitDate"] = time

        # check if the commit was made between 9 AM and 5 PM local time
        if commit.author_tz_offset != 0 and 9 <= time.hour <= 17:
            authorInfo["sponsoredCommitCount"] += 1

    print("Analyzing authors")
    sponsoredAuthorCount = 0
    for login, author in authorInfoDict.items():

        # check if sponsored (at least 95% of commits during working hours)
        commitCount = int(author["commitCount"])
        sponsoredCommitCount = int(author["sponsoredCommitCount"])
        ratio = sponsoredCommitCount / commitCount
        if ratio >= 0.95:
            author["sponsored"] = True
            sponsoredAuthorCount += 1

        # calculate active days
        earliestDate = author["earliestCommitDate"]
        latestDate = author["latestCommitDate"]
        activeDays = (latestDate - earliestDate).days + 1
        author["activeDays"] = activeDays

        # check if experienced
        if activeDays >= experienceDays:
            author["experienced"] = True

    # calculate percentage sponsored authors
    percentageSponsoredAuthors = sponsoredAuthorCount / len(authorInfoDict)

    # calculate active project days
    firstCommitDate = datetime.fromtimestamp(commits[-1].committed_date)
    lastCommitDate = datetime.fromtimestamp(commits[0].committed_date)
    daysActive = (lastCommitDate - firstCommitDate).days

    print("Outputting CSVs")

    # output author days on project
    with open(os.path.join(outputDir, "authorDaysOnProject.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Author", "# of Days"])
        for login, author in authorInfoDict.items():
            w.writerow([login, author["activeDays"]])

    # output commits per author
    with open(os.path.join(outputDir, "commitsPerAuthor.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Author", "Commit Count"])
        for login, author in authorInfoDict.items():
            w.writerow([login, author["commitCount"]])

    # output timezones
    with open(os.path.join(outputDir, "timezones.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Timezone Offset", "Author Count", "Commit Count"])
        for timezone in timezoneInfoDict:
            timezoneInfo = timezoneInfoDict[timezone]
            w.writerow(
                [timezone, len(timezoneInfo["authors"]), timezoneInfo["commitCount"]]
            )

    # output project info
    with open(os.path.join(outputDir, "project.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["DaysActive", daysActive])
        w.writerow(["FirstCommitDate", "{:%Y-%m-%d}".format(firstCommitDate)])
        w.writerow(["LastCommitDate", "{:%Y-%m-%d}".format(lastCommitDate)])
        w.writerow(["AuthorCount", len([*authorInfoDict])])
        w.writerow(["SponsoredAuthorCount", sponsoredAuthorCount])
        w.writerow(["PercentageSponsoredAuthors", percentageSponsoredAuthors])
        w.writerow(["TimezoneCount", len([*timezoneInfoDict])])

    outputStatistics(
        [author["activeDays"] for login, author in authorInfoDict.items()],
        "ActiveDays",
        outputDir,
    )

    outputStatistics(
        [author["commitCount"] for login, author in authorInfoDict.items()],
        "CommitCount",
        outputDir,
    )

    return authorInfoDict
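
outputStatistics is used throughout but not listed; note that the batched variant in Example #7 takes an extra leading index argument and writes to a results path instead. A minimal sketch for the three-argument form used here, assuming it appends summary rows to the shared project.csv:

import statistics

def outputStatistics(data, metricName, outputDir):
    # hypothetical: append count / mean / stdev / min / max rows for one metric
    with open(os.path.join(outputDir, "project.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow([metricName + "Count", len(data)])
        if data:
            w.writerow([metricName + "Mean", statistics.mean(data)])
            w.writerow([metricName + "Stdev",
                        statistics.stdev(data) if len(data) > 1 else 0])
            w.writerow([metricName + "Min", min(data)])
            w.writerow([metricName + "Max", max(data)])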
Example #7
def commitBatchAnalysis(
    idx: int, senti: PySentiStr, commits: List[git.Commit], config: Configuration
):

    authorInfoDict = {}
    timezoneInfoDict = {}
    experienceDays = 150

    # traverse all commits
    print("Analyzing commits")
    startDate = None
    if config.startDate is not None:
        startDate = datetime.strptime(config.startDate, "%Y-%m-%d")
        startDate = startDate.replace(tzinfo=pytz.UTC)
    # sort commits
    commits.sort(key=lambda o: o.committed_datetime, reverse=True)

    commitMessages = []
    commit: Commit
    lastDate = None
    firstDate = None
    realCommitCount = 0
    for commit in Bar("Processing").iter(commits):
        if startDate is not None and startDate > commit.committed_datetime:
            continue
        if lastDate is None:
            lastDate = commit.committed_date
        firstDate = commit.committed_date
        realCommitCount += 1
        # extract info
        author = authorIdExtractor(commit.author)
        timezone = commit.author_tz_offset
        time = commit.authored_datetime

        # get timezone
        timezoneInfo = timezoneInfoDict.setdefault(
            timezone, dict(commitCount=0, authors=set())
        )

        # save info
        timezoneInfo["authors"].add(author)

        if commit.message and commit.message.strip():
            commitMessages.append(commit.message)

        # increase commit count
        timezoneInfo["commitCount"] += 1

        # get author
        authorInfo = authorInfoDict.setdefault(
            author,
            dict(
                commitCount=0,
                sponsoredCommitCount=0,
                earliestCommitDate=time,
                latestCommitDate=time,
                sponsored=False,
                activeDays=0,
                experienced=False,
            ),
        )

        # increase commit count
        authorInfo["commitCount"] += 1

        # validate earliest commit
        # by default GitPython orders commits from latest to earliest
        if time < authorInfo["earliestCommitDate"]:
            authorInfo["earliestCommitDate"] = time

        # check if the commit was made between 9 AM and 5 PM local time
        if commit.author_tz_offset != 0 and 9 <= time.hour <= 17:
            authorInfo["sponsoredCommitCount"] += 1

    print("Analyzing commit message sentiment")
    sentimentScores = []
    commitMessageSentimentsPositive = []
    commitMessageSentimentsNegative = []

    if commitMessages:
        sentimentScores = senti.getSentiment(commitMessages)
        commitMessageSentimentsPositive = [
            score for score in sentimentScores if score >= 1
        ]
        commitMessageSentimentsNegative = [
            score for score in sentimentScores if score <= -1
        ]

    print("Analyzing authors")
    sponsoredAuthorCount = 0
    for login, author in authorInfoDict.items():

        # check if sponsored (at least 95% of commits during working hours)
        commitCount = int(author["commitCount"])
        sponsoredCommitCount = int(author["sponsoredCommitCount"])
        ratio = sponsoredCommitCount / commitCount
        if ratio >= 0.95:
            author["sponsored"] = True
            sponsoredAuthorCount += 1

        # calculate active days
        earliestDate = author["earliestCommitDate"]
        latestDate = author["latestCommitDate"]
        activeDays = (latestDate - earliestDate).days + 1
        author["activeDays"] = activeDays

        # check if experienced
        if activeDays >= experienceDays:
            author["experienced"] = True

    # calculate percentage sponsored authors (guard against an empty batch)
    percentageSponsoredAuthors = (
        sponsoredAuthorCount / len(authorInfoDict) if authorInfoDict else 0
    )

    # calculate active project days
    firstCommitDate = None
    lastCommitDate = None
    if firstDate is not None:
        firstCommitDate = datetime.fromtimestamp(firstDate)
    if lastDate is not None:
        lastCommitDate = datetime.fromtimestamp(lastDate)
    daysActive = 0
    if lastCommitDate is not None:
        daysActive = (lastCommitDate - firstCommitDate).days

    print("Outputting CSVs")

    # output author days on project
    with open(
        os.path.join(config.metricsPath, f"authorDaysOnProject_{idx}.csv"),
        "a",
        newline="",
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Author", "# of Days"])
        for login, author in authorInfoDict.items():
            w.writerow([login, author["activeDays"]])

    # output commits per author
    with open(
        os.path.join(config.metricsPath, f"commitsPerAuthor_{idx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Author", "Commit Count"])
        for login, author in authorInfoDict.items():
            w.writerow([login, author["commitCount"]])

    # output timezones
    with open(
        os.path.join(config.metricsPath, f"timezones_{idx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Timezone Offset", "Author Count", "Commit Count"])
        for key, timezone in timezoneInfoDict.items():
            w.writerow([key, len(timezone["authors"]), timezone["commitCount"]])

    # output results
    with open(
        os.path.join(config.resultsPath, f"results_{idx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["CommitCount", realCommitCount])
        w.writerow(["DaysActive", daysActive])
        w.writerow(["FirstCommitDate", "{:%Y-%m-%d}".format(firstCommitDate)])
        w.writerow(["LastCommitDate", "{:%Y-%m-%d}".format(lastCommitDate)])
        w.writerow(["AuthorCount", len([*authorInfoDict])])
        w.writerow(["SponsoredAuthorCount", sponsoredAuthorCount])
        w.writerow(["PercentageSponsoredAuthors", percentageSponsoredAuthors])
        w.writerow(["TimezoneCount", len([*timezoneInfoDict])])

    outputStatistics(
        idx,
        [author["activeDays"] for login, author in authorInfoDict.items()],
        "AuthorActiveDays",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        [author["commitCount"] for login, author in authorInfoDict.items()],
        "AuthorCommitCount",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        [len(timezone["authors"]) for key, timezone in timezoneInfoDict.items()],
        "TimezoneAuthorCount",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        [timezone["commitCount"] for key, timezone in timezoneInfoDict.items()],
        "TimezoneCommitCount",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        sentimentScores,
        "CommitMessageSentiment",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        commitMessageSentimentsPositive,
        "CommitMessageSentimentsPositive",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        commitMessageSentimentsNegative,
        "CommitMessageSentimentsNegative",
        config.resultsPath,
    )

    return authorInfoDict, daysActive
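
Example #7 expects an already configured PySentiStr instance. With the sentistrength package the setup typically looks like the following; the jar and language-folder paths are placeholders, and score="scale" matches the single-value scores the example filters on:

senti = PySentiStr()
# placeholders: point these at a local SentiStrength installation
senti.setSentiStrengthPath("/path/to/SentiStrength.jar")
senti.setSentiStrengthLanguageFolderPath("/path/to/SentiStrength_Data/")

scores = senti.getSentiment(["great, the build passes", "this is broken again"],
                            score="scale")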