def processBatch(batchIdx: int, commits: List[Commit], config: Configuration):
    """Build the co-author "centrality" relation for one batch of commits.

    For every commit, all *other* authors who committed within one month
    either side of it (per findRelatedCommits) are recorded as related to
    the commit's author. Per-author commit counts are tallied alongside.

    Fix: the original built each related-author set via
    ``set(list(map(..., filter(...))))`` — two redundant materializations;
    a set comprehension is equivalent and direct.

    :param batchIdx: index of this batch, forwarded to prepareGraph
    :param commits: commits belonging to this batch
    :param config: project configuration, forwarded to prepareGraph
    :return: whatever prepareGraph returns for the "commitCentrality" graph
    """
    allRelatedAuthors = {}
    authorCommits = Counter({})

    print("Analyzing centrality")
    for commit in Bar("Processing").iter(commits):
        author = authorIdExtractor(commit.author)

        # increase author commit count
        authorCommits.update({author: 1})

        # related-author window: one calendar month either side of this commit
        commitDate = datetime.fromtimestamp(commit.committed_date)
        earliestDate = commitDate + relativedelta(months=-1)
        latestDate = commitDate + relativedelta(months=+1)

        # authors of other commits that fall inside the window
        # NOTE: O(len(commits)) scan per commit — quadratic overall, as before
        commitRelatedAuthors = {
            authorIdExtractor(c.author)
            for c in commits
            if findRelatedCommits(author, earliestDate, latestDate, c)
        }

        # merge into this author's accumulated related-author set
        allRelatedAuthors.setdefault(author, set()).update(commitRelatedAuthors)

    return prepareGraph(
        allRelatedAuthors, authorCommits, batchIdx, "commitCentrality", config
    )
def findRelatedCommits(author, earliestDate, latestDate, commit):
    """Predicate: is *commit* "related" to *author*?

    A commit is related when it was authored by someone other than
    *author* and its commit date falls inside the inclusive window
    [earliestDate, latestDate].
    """
    # a commit by the same author is never related
    if authorIdExtractor(commit.author) == author:
        return False

    commitDate = datetime.fromtimestamp(commit.committed_date)
    return earliestDate <= commitDate <= latestDate
def replaceAll(commits, aliases):
    """Yield each commit with its author email canonicalized via *aliases*.

    Fix (naming/clarity only, behavior unchanged): the original bound the
    commit to a variable called ``copy`` without actually copying it, so the
    email rewrite has always mutated the input commit object in place. The
    misleading pseudo-copy is removed and the in-place mutation is explicit.

    :param commits: iterable of GitPython commit objects
    :param aliases: mapping from extracted author id to canonical email
    :yield: the same commit objects, with aliased emails rewritten
    """
    for commit in Bar("Processing").iter(list(commits)):
        author = authorIdExtractor(commit.author)

        if author in aliases:
            # in-place mutation: the yielded object IS the input commit
            commit.author.email = aliases[author]

        yield commit
def extractAliases(repo: git.Repo, aliasPath: str, repoShortname: str, token: str, maxDistance: float):
    """Discover author-email aliases for a repository and dump them as YAML.

    Strategy: resolve each distinct author email to a GitHub login via the
    commits API; emails sharing a login are aliases. Emails with no login
    are then clustered by string similarity (areSimilar with maxDistance).

    :param repo: GitPython repository to scan
    :param aliasPath: output YAML path (opened in APPEND mode — repeated runs
        accumulate documents; presumably intentional, verify against caller)
    :param repoShortname: "owner/name" used in the GitHub API URL
    :param token: GitHub API token for the Authorization header
    :param maxDistance: similarity threshold forwarded to areSimilar
    """
    commits = list(repo.iter_commits())

    # get all distinct author emails
    emails = set(
        authorIdExtractor(commit.author) for commit in Bar("Processing").iter(commits))

    # get a commit per email
    # NOTE(review): re-walks repo.iter_commits() once per email — O(emails *
    # commits); acceptable for small repos, slow for large ones
    shasByEmail = {}
    for email in Bar("Processing").iter(emails):
        commit = next(commit for commit in repo.iter_commits()
                      if authorIdExtractor(commit.author) == email)
        shasByEmail[email] = commit.hexsha

    # query github for author logins by their commits
    loginsByEmail = dict()
    emailsWithoutLogins = []
    for email in Bar("Processing").iter(shasByEmail):
        sha = shasByEmail[email]
        url = "https://api.github.com/repos/{}/commits/{}".format(
            repoShortname, sha)
        request = requests.get(url, headers={"Authorization": "token " + token})
        commit = request.json()

        # response without an "author" key (e.g. error payload) is skipped
        if not "author" in commit.keys():
            continue

        if not commit["author"] is None and not commit["author"][
                "login"] is None:
            loginsByEmail[email] = commit["author"]["login"]
        else:
            emailsWithoutLogins.append(email)

    # build initial alias collection from logins:
    # aliases maps login -> [emails]; usedAsValues maps email -> its login
    aliases = {}
    usedAsValues = {}
    for email in loginsByEmail:
        login = loginsByEmail[email]
        aliasEmails = aliases.setdefault(login, [])
        aliasEmails.append(email)
        usedAsValues[email] = login

    if (len(emailsWithoutLogins) > 0):
        for authorA in Bar("Processing").iter(emailsWithoutLogins):
            quickMatched = False

            # go through used values
            # NOTE: usedAsValues is mutated during iteration; safe only
            # because the loop breaks immediately after the insert
            for key in usedAsValues:
                if authorA == key:
                    quickMatched = True
                    continue
                if areSimilar(authorA, key, maxDistance):
                    alias = usedAsValues[key]
                    aliases[alias].append(authorA)
                    usedAsValues[authorA] = alias
                    quickMatched = True
                    break

            if quickMatched:
                continue

            # go through already extracted keys
            # NOTE: same mutate-then-break pattern on usedAsValues
            for key in aliases:
                if authorA == key:
                    quickMatched = True
                    continue
                if areSimilar(authorA, key, maxDistance):
                    aliases[key].append(authorA)
                    usedAsValues[authorA] = key
                    quickMatched = True
                    break

            if quickMatched:
                continue

            # go through all authors: make authorA a new alias root and
            # attach the first similar unmatched author to it
            for authorB in emailsWithoutLogins:
                if authorA == authorB:
                    continue
                if areSimilar(authorA, authorB, maxDistance):
                    aliasedAuthor = aliases.setdefault(authorA, [])
                    aliasedAuthor.append(authorB)
                    usedAsValues[authorB] = authorA
                    break

    print("Writing aliases to '{0}'".format(aliasPath))
    # ensure the output directory exists before writing
    if not os.path.exists(os.path.dirname(aliasPath)):
        os.makedirs(os.path.dirname(aliasPath))
    with open(aliasPath, "a", newline="") as f:
        yaml.dump(aliases, f)
def centralityAnalysis(repo: git.Repo, commits: List[git.Commit], outputDir: str):
    """Compute author-collaboration centrality metrics and write CSVs + PNG.

    Builds an undirected author graph where an edge links two authors who
    committed within one month of each other (per findRelatedCommits), then
    computes closeness/betweenness/degree centrality, density and greedy
    modularity communities, writes several CSVs under *outputDir*, and
    renders the graph to graph.png.

    :param repo: unused here except through the commit list — kept for API
        symmetry with callers (TODO confirm)
    :param commits: commits to analyze
    :param outputDir: directory that receives project.csv, community.csv,
        centrality.csv and graph.png
    """
    allRelatedAuthors = {}
    authorCommits = Counter({})

    # for all commits...
    print("Analyzing centrality")
    for commit in Bar("Processing").iter(commits):
        author = authorIdExtractor(commit.author)

        # increase author commit count
        authorCommits.update({author: 1})

        # initialize dates for related author analysis:
        # one calendar month either side of this commit
        commitDate = datetime.fromtimestamp(commit.committed_date)
        earliestDate = commitDate + relativedelta(months=-1)
        latestDate = commitDate + relativedelta(months=+1)

        # find authors related to this commit
        # commitRelatedCommits = commit.iter_items(
        #     repo, 'master',
        #     after=earliestDate.strftime('%Y-%m-%d'),
        #     before=latestDate.strftime('%Y-%m-%d'))

        # NOTE: linear scan of all commits per commit — quadratic overall
        commitRelatedCommits = filter(
            lambda c: findRelatedCommits(author, earliestDate, latestDate, c),
            commits)

        commitRelatedAuthors = set(
            list(
                map(lambda c: authorIdExtractor(c.author),
                    commitRelatedCommits)))

        # get current related authors collection and update it
        authorRelatedAuthors = allRelatedAuthors.setdefault(author, set())
        authorRelatedAuthors.update(commitRelatedAuthors)

    # prepare graph
    print("Preparing NX graph")
    G = nx.Graph()

    for author in allRelatedAuthors:
        G.add_node(author)

        for relatedAuthor in allRelatedAuthors[author]:
            # edges use stripped names; node added above may differ if the
            # author id carries surrounding whitespace — TODO confirm
            G.add_edge(author.strip(), relatedAuthor.strip())

    # analyze graph
    closeness = dict(nx.closeness_centrality(G))
    betweenness = dict(nx.betweenness_centrality(G))
    centrality = dict(nx.degree_centrality(G))
    density = nx.density(G)
    modularity = []

    try:
        # one [authorCount, commitCount] row per detected community
        for idx, community in enumerate(greedy_modularity_communities(G)):
            authorCount = len(community)
            communityCommitCount = sum(authorCommits[author]
                                       for author in community)
            row = [authorCount, communityCommitCount]
            modularity.append(row)
    except ZeroDivisionError:
        # not handled: greedy_modularity_communities can divide by zero on
        # degenerate graphs; modularity stays partial/empty in that case
        pass

    # finding high centrality authors (degree centrality above 0.5)
    numberHighCentralityAuthors = len([
        author for author, centrality in centrality.items() if centrality > 0.5
    ])

    # NOTE(review): raises ZeroDivisionError if commits is empty
    percentageHighCentralityAuthors = numberHighCentralityAuthors / len(
        allRelatedAuthors)

    print("Outputting CSVs")

    # output non-tabular results (append mode: accumulates across runs)
    with open(os.path.join(outputDir, "project.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Density", density])
        w.writerow(["Community Count", len(modularity)])

    # output community information
    with open(os.path.join(outputDir, "community.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Community Index", "Author Count", "Commit Count"])
        for idx, community in enumerate(modularity):
            w.writerow([idx + 1, community[0], community[1]])

    # combine centrality results into one row per author
    combined = {}
    for key in closeness:
        single = {
            "Author": key,
            "Closeness": closeness[key],
            "Betweenness": betweenness[key],
            "Centrality": centrality[key],
        }
        combined[key] = single

    # output tabular results (write mode: overwritten each run)
    with open(os.path.join(outputDir, "centrality.csv"), "w", newline="") as f:
        w = csv.DictWriter(
            f, ["Author", "Closeness", "Betweenness", "Centrality"])
        w.writeheader()

        for key in combined:
            w.writerow(combined[key])

    # output high centrality authors
    with open(os.path.join(outputDir, "project.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(
            ["NumberHighCentralityAuthors", numberHighCentralityAuthors])
        w.writerow([
            "PercentageHighCentralityAuthors", percentageHighCentralityAuthors
        ])

    # output statistics
    outputStatistics(
        [value for key, value in closeness.items()],
        "Closeness",
        outputDir,
    )

    outputStatistics(
        [value for key, value in betweenness.items()],
        "Betweenness",
        outputDir,
    )

    outputStatistics(
        [value for key, value in centrality.items()],
        "Centrality",
        outputDir,
    )

    outputStatistics(
        [community[0] for community in modularity],
        "CommunityAuthorCount",
        outputDir,
    )

    outputStatistics(
        [community[1] for community in modularity],
        "CommunityCommitCount",
        outputDir,
    )

    # output graph to PNG
    # NOTE(review): figure number 5 is reused and never closed — repeated
    # calls draw onto the same figure
    print("Outputting graph to PNG")
    graphFigure = plt.figure(5, figsize=(30, 30))
    nx.draw(
        G,
        with_labels=True,
        node_color="orange",
        node_size=4000,
        edge_color="black",
        linewidths=2,
        font_size=20,
    )
    graphFigure.savefig(os.path.join(outputDir, "graph.png"))
def commitAnalysis(commits: List[git.Commit], outputDir: str):
    """Collect per-author and per-timezone commit statistics, write CSVs.

    For each author: commit count, earliest/latest commit date, active days,
    "sponsored" flag (>= 95% of commits between 09:00 and 17:00 local time,
    non-UTC offset) and "experienced" flag (active >= 150 days).

    :param commits: commits to analyze; assumed ordered latest-to-earliest
        (GitPython default) — latestCommitDate and lastCommitDate rely on it
    :param outputDir: directory receiving the CSV outputs (append mode)
    :return: the per-author info dict
    """
    authorInfoDict = {}
    timezoneInfoDict = {}
    experienceDays = 150  # threshold (days active) for "experienced"

    # traverse all commits
    print("Analyzing commits")
    for commit in Bar("Processing").iter(commits):

        # extract info
        author = authorIdExtractor(commit.author)
        timezone = commit.author_tz_offset
        time = commit.authored_datetime

        # get timezone
        timezoneInfo = timezoneInfoDict.setdefault(
            timezone, dict(commitCount=0, authors=set())
        )

        # save author
        timezoneInfo["authors"].add(author)

        # increase commit count
        timezoneInfo["commitCount"] += 1

        # get author; first sighting seeds both earliest and latest dates
        authorInfo = authorInfoDict.setdefault(
            author,
            dict(
                commitCount=0,
                sponsoredCommitCount=0,
                earliestCommitDate=time,
                latestCommitDate=time,
                sponsored=False,
                activeDays=0,
                experienced=False,
            ),
        )

        # increase commit count
        authorInfo["commitCount"] += 1

        # validate earliest commit
        # by default GitPython orders commits from latest to earliest
        # (latestCommitDate is never re-checked for the same reason)
        if time < authorInfo["earliestCommitDate"]:
            authorInfo["earliestCommitDate"] = time

        # check if commit was between 9 and 5
        # (UTC-offset-zero commits are excluded — presumably treated as
        # server/CI commits; verify against project docs)
        if not commit.author_tz_offset == 0 and time.hour >= 9 and time.hour <= 17:
            authorInfo["sponsoredCommitCount"] += 1

    print("Analyzing authors")
    sponsoredAuthorCount = 0
    for login, author in authorInfoDict.items():

        # check if sponsored: >= 95% of commits during work hours
        commitCount = int(author["commitCount"])
        sponsoredCommitCount = int(author["sponsoredCommitCount"])
        diff = sponsoredCommitCount / commitCount
        if diff >= 0.95:
            author["sponsored"] = True
            sponsoredAuthorCount += 1

        # calculate active days (inclusive of both endpoints)
        earliestDate = author["earliestCommitDate"]
        latestDate = author["latestCommitDate"]
        activeDays = (latestDate - earliestDate).days + 1
        author["activeDays"] = activeDays

        # check if experienced
        if activeDays >= experienceDays:
            author["experienced"] = True

    # calculate percentage sponsored authors
    # NOTE(review): raises ZeroDivisionError when commits is empty
    percentageSponsoredAuthors = sponsoredAuthorCount / len([*authorInfoDict])

    # calculate active project days (list is latest-first, so the last
    # element is the first commit)
    firstCommitDate = datetime.fromtimestamp(commits[len(commits) - 1].committed_date)
    lastCommitDate = datetime.fromtimestamp(commits[0].committed_date)
    daysActive = (lastCommitDate - firstCommitDate).days

    print("Outputting CSVs")

    # output author days on project
    with open(os.path.join(outputDir, "authorDaysOnProject.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Author", "# of Days"])
        for login, author in authorInfoDict.items():
            w.writerow([login, author["activeDays"]])

    # output commits per author
    with open(os.path.join(outputDir, "commitsPerAuthor.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Author", "Commit Count"])
        for login, author in authorInfoDict.items():
            w.writerow([login, author["commitCount"]])

    # output timezones
    with open(os.path.join(outputDir, "timezones.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Timezone Offset", "Author Count", "Commit Count"])
        for timezone in timezoneInfoDict:
            timezoneInfo = timezoneInfoDict[timezone]
            w.writerow(
                [timezone, len(timezoneInfo["authors"]), timezoneInfo["commitCount"]]
            )

    # output project info
    with open(os.path.join(outputDir, "project.csv"), "a", newline="") as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["DaysActive", daysActive])
        w.writerow(["FirstCommitDate", "{:%Y-%m-%d}".format(firstCommitDate)])
        w.writerow(["LastCommitDate", "{:%Y-%m-%d}".format(lastCommitDate)])
        w.writerow(["AuthorCount", len([*authorInfoDict])])
        w.writerow(["SponsoredAuthorCount", sponsoredAuthorCount])
        w.writerow(["PercentageSponsoredAuthors", percentageSponsoredAuthors])
        w.writerow(["TimezoneCount", len([*timezoneInfoDict])])

    outputStatistics(
        [author["activeDays"] for login, author in authorInfoDict.items()],
        "ActiveDays",
        outputDir,
    )

    outputStatistics(
        [author["commitCount"] for login, author in authorInfoDict.items()],
        "CommitCount",
        outputDir,
    )

    return authorInfoDict
def commitBatchAnalysis(
    idx: int, senti: PySentiStr, commits: List[git.Commit], config: Configuration
):
    """Batch variant of commit analysis with sentiment scoring.

    Filters commits by config.startDate (UTC), gathers per-author and
    per-timezone statistics, scores commit-message sentiment via *senti*,
    and writes batch-indexed CSVs under config.metricsPath / resultsPath.

    :param idx: batch index, used as suffix in output file names
    :param senti: PySentiStr instance used to score commit messages
    :param commits: commits for this batch; sorted in place (latest first)
    :param config: provides startDate, metricsPath and resultsPath
    :return: (per-author info dict, days the project was active in batch)
    """
    authorInfoDict = {}
    timezoneInfoDict = {}
    experienceDays = 150  # threshold (days active) for "experienced"

    # traverse all commits
    print("Analyzing commits")

    startDate = None
    if config.startDate is not None:
        startDate = datetime.strptime(config.startDate, "%Y-%m-%d")
        startDate = startDate.replace(tzinfo=pytz.UTC)

    # sort commits (mutates caller's list; latest first)
    commits.sort(key=lambda o: o.committed_datetime, reverse=True)

    commitMessages = []
    commit: Commit
    lastDate = None
    firstDate = None
    realCommitCount = 0

    for commit in Bar("Processing").iter(commits):
        if startDate is not None and startDate > commit.committed_datetime:
            # older than the analysis window — skip
            continue

        # commits arrive latest-first: first kept commit fixes lastDate,
        # each later iteration pushes firstDate earlier
        if lastDate is None:
            lastDate = commit.committed_date
        firstDate = commit.committed_date
        realCommitCount = realCommitCount + 1

        # extract info
        author = authorIdExtractor(commit.author)
        timezone = commit.author_tz_offset
        time = commit.authored_datetime

        # get timezone
        timezoneInfo = timezoneInfoDict.setdefault(
            timezone, dict(commitCount=0, authors=set())
        )

        # save info
        timezoneInfo["authors"].add(author)

        if commit.message and commit.message.strip():
            commitMessages.append(commit.message)

        # increase commit count
        timezoneInfo["commitCount"] += 1

        # get author; first sighting seeds both earliest and latest dates
        authorInfo = authorInfoDict.setdefault(
            author,
            dict(
                commitCount=0,
                sponsoredCommitCount=0,
                earliestCommitDate=time,
                latestCommitDate=time,
                sponsored=False,
                activeDays=0,
                experienced=False,
            ),
        )

        # increase commit count
        authorInfo["commitCount"] += 1

        # validate earliest commit
        # by default GitPython orders commits from latest to earliest
        if time < authorInfo["earliestCommitDate"]:
            authorInfo["earliestCommitDate"] = time

        # check if commit was between 9 and 5 (non-UTC offsets only)
        if not commit.author_tz_offset == 0 and time.hour >= 9 and time.hour <= 17:
            authorInfo["sponsoredCommitCount"] += 1

    print("Analyzing commit message sentiment")
    sentimentScores = []
    commitMessageSentimentsPositive = []
    commitMessageSentimentsNegative = []

    if len(commitMessages) > 0:
        # scores are presumably SentiStrength scale integers — verify
        sentimentScores = senti.getSentiment(commitMessages)
        commitMessageSentimentsPositive = list(
            result for result in filter(lambda value: value >= 1, sentimentScores)
        )
        commitMessageSentimentsNegative = list(
            result for result in filter(lambda value: value <= -1, sentimentScores)
        )

    print("Analyzing authors")
    sponsoredAuthorCount = 0
    for login, author in authorInfoDict.items():

        # check if sponsored: >= 95% of commits during work hours
        commitCount = int(author["commitCount"])
        sponsoredCommitCount = int(author["sponsoredCommitCount"])
        diff = sponsoredCommitCount / commitCount
        if diff >= 0.95:
            author["sponsored"] = True
            sponsoredAuthorCount += 1

        # calculate active days (inclusive of both endpoints)
        earliestDate = author["earliestCommitDate"]
        latestDate = author["latestCommitDate"]
        activeDays = (latestDate - earliestDate).days + 1
        author["activeDays"] = activeDays

        # check if experienced
        if activeDays >= experienceDays:
            author["experienced"] = True

    # calculate percentage sponsored authors
    # NOTE(review): raises ZeroDivisionError when no commit passes the
    # startDate filter (authorInfoDict empty)
    percentageSponsoredAuthors = sponsoredAuthorCount / len([*authorInfoDict])

    # calculate active project days
    firstCommitDate = None
    lastCommitDate = None
    if firstDate is not None:
        firstCommitDate = datetime.fromtimestamp(firstDate)
    if lastDate is not None:
        lastCommitDate = datetime.fromtimestamp(lastDate)
    daysActive = 0
    if lastCommitDate is not None:
        daysActive = (lastCommitDate - firstCommitDate).days

    print("Outputting CSVs")

    # output author days on project
    with open(
        os.path.join(config.metricsPath, f"authorDaysOnProject_{idx}.csv"),
        "a",
        newline="",
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Author", "# of Days"])
        for login, author in authorInfoDict.items():
            w.writerow([login, author["activeDays"]])

    # output commits per author
    with open(
        os.path.join(config.metricsPath, f"commitsPerAuthor_{idx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Author", "Commit Count"])
        for login, author in authorInfoDict.items():
            w.writerow([login, author["commitCount"]])

    # output timezones
    with open(
        os.path.join(config.metricsPath, f"timezones_{idx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Timezone Offset", "Author Count", "Commit Count"])
        for key, timezone in timezoneInfoDict.items():
            w.writerow([key, len(timezone["authors"]), timezone["commitCount"]])

    # output results
    # NOTE(review): the date format rows raise TypeError if
    # firstCommitDate / lastCommitDate are still None (no commits kept)
    with open(
        os.path.join(config.resultsPath, f"results_{idx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["CommitCount", realCommitCount])
        w.writerow(["DaysActive", daysActive])
        w.writerow(["FirstCommitDate", "{:%Y-%m-%d}".format(firstCommitDate)])
        w.writerow(["LastCommitDate", "{:%Y-%m-%d}".format(lastCommitDate)])
        w.writerow(["AuthorCount", len([*authorInfoDict])])
        w.writerow(["SponsoredAuthorCount", sponsoredAuthorCount])
        w.writerow(["PercentageSponsoredAuthors", percentageSponsoredAuthors])
        w.writerow(["TimezoneCount", len([*timezoneInfoDict])])

    outputStatistics(
        idx,
        [author["activeDays"] for login, author in authorInfoDict.items()],
        "AuthorActiveDays",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        [author["commitCount"] for login, author in authorInfoDict.items()],
        "AuthorCommitCount",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        [len(timezone["authors"]) for key, timezone in timezoneInfoDict.items()],
        "TimezoneAuthorCount",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        [timezone["commitCount"] for key, timezone in timezoneInfoDict.items()],
        "TimezoneCommitCount",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        sentimentScores,
        "CommitMessageSentiment",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        commitMessageSentimentsPositive,
        "CommitMessageSentimentsPositive",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        commitMessageSentimentsNegative,
        "CommitMessageSentimentsNegative",
        config.resultsPath,
    )

    return authorInfoDict, daysActive