def fillCommit_IssuesTable_Through_Commit(commits, projectName, subsystem, commit_rows):
    """Link commits to issues by scanning commit messages for '#<number>' references.

    For every issue registered for *subsystem*, searches each commit message for
    ' #<n>' or ' (#<n>' and, on a match, records the pair in commits_Issues and
    adds a type-10 row to Involvement.

    Args:
        commits: iterable of perceval-style git commit dicts (with a 'data' payload).
        projectName: project identifier stored on Involvement rows.
        subsystem: subsystem whose issues are candidates for linking.
        commit_rows: rows from the commit table; row[0] is the numeric CommitID,
            row[1] the commit hash, row[2] the commit timestamp.
    """
    connection = getDBConn()
    cursor = connection.cursor()
    # Parameterized query — subsystem was previously concatenated into the SQL.
    cursor.execute(
        "SELECT DISTINCT IssueID FROM Subsystems_Issues WHERE SubsystemID = %s",
        (subsystem,),
    )
    issues_rows = cursor.fetchall()
    for git_commit in commits:
        message = git_commit['data']['message']
        commit_hash = git_commit['data']['commit']
        for issue_row in issues_rows:
            issue_id = issue_row[0]
            # IssueID is stored as 'owner/repo#<number>'; extract the number part.
            issue_number = re.escape(issue_id[issue_id.rfind('#') + 1:])
            # Match ' #123' or ' (#123' followed by a word boundary so that a
            # reference to issue 123 does not also count as issue 12.
            if (re.search(r'\b #' + issue_number + r'\b', message)
                    or re.search(r'\b \(#' + issue_number + r'\b', message)):
                for commit_row in commit_rows:
                    if commit_row[1] == commit_hash:
                        cursor.execute(
                            "INSERT INTO commits_Issues (CommitID,IssueID) VALUES (%s,%s) "
                            "ON DUPLICATE KEY UPDATE CommitID = CommitID, IssueID = IssueID",
                            (str(commit_row[0]), issue_id),
                        )
                        print("I am inserting" + "commit:" + str(commit_row[0]) + "issue:" + issue_id)
                        cursor.execute(
                            "INSERT INTO Involvement (ID, ProjectID,Type,SourceURL,CommitID,IssueID,Time) "
                            "VALUES (default,%s,'10',%s,%s,%s,%s)",
                            (projectName, commit_row[1], str(commit_row[0]), issue_id, str(commit_row[2])),
                        )
    cursor.close()
    connection.commit()
    connection.close()  # fixed: was `connection.close` — attribute access, never called
def fillArtifacts_CommitsTable(commits, subsystem, projectName):
    """Populate the artifacts_commits join table for every file touched by a commit.

    Matches each commit's files against the artifact table rows for *subsystem*
    and inserts one (ArtifactID, CommitID) pair per file.

    Returns:
        The commit table rows fetched for *projectName*, reused by callers.
    """
    connection = getDBConn()
    cursor = connection.cursor()
    cursor.execute("SELECT * FROM artifact WHERE SubsystemId = %s", (subsystem,))
    artifact_rows = cursor.fetchall()
    cursor.execute("SELECT * FROM commit where ProjectId = %s", (projectName,))
    commit_rows = cursor.fetchall()
    # Build a URL -> ArtifactID map once instead of re-scanning artifact_rows for
    # every file of every commit (was O(commits * files * artifacts)). With
    # duplicate URLs the last row wins, matching the original loop's behavior.
    artifact_id_by_url = {row[1]: row[0] for row in artifact_rows}
    for commit in commits:
        for commit_row in commit_rows:
            if commit_row[1] == commit["data"]["commit"]:
                commitID = commit_row[0]
                for file in commit["data"]["files"]:
                    fileURL = file['file']
                    # Falls back to "" when the artifact is unknown; the row is
                    # still inserted, matching the original behavior.
                    artifactID = artifact_id_by_url.get(fileURL, "")
                    print("Inserting artifact" + str(artifactID))
                    print("Inserting Commit" + str(commitID))
                    cursor.execute(
                        "INSERT INTO artifacts_commits (ArtifactID,CommitID) VALUES (%s,%s) "
                        "ON DUPLICATE KEY UPDATE ArtifactID=ArtifactID,CommitID = CommitID",
                        (str(artifactID), str(commitID)),
                    )
    cursor.close()
    connection.commit()
    connection.close()  # fixed: was `connection.close` — attribute access, never called
    return commit_rows
def fillInvolvementTable_Dev_CommitInvolvement(projectName, unique_developers):
    """Record type-10 (commit) involvement rows linking developers to commits.

    Reads folder-component.csv to find this project's subsystems, re-fetches each
    subsystem's git commits, and for every commit already in the commit table
    either updates the existing involvement rows' UserID or inserts a new row.

    Args:
        projectName: project identifier.
        unique_developers: de-duplicated developer records; when the commit author
            email matches a 'Linked_Developer_Email', the canonical 'Email' is
            used as the UserID instead of the raw author email.
    """
    connection = getDBConn()
    cursor = connection.cursor()
    cursor.execute("SELECT * FROM commit where ProjectId = %s", (projectName,))
    commit_rows = cursor.fetchall()
    folder_component = dict()
    totalSubsystem = []
    with open('folder-component.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            if row['project'] == projectName:
                print(row['subsystem'], row['folder'])
                folder_component[row['folder']] = row['subsystem']
                totalSubsystem.append(row['subsystem'])
    for subsystem in totalSubsystem:
        print("subsystem" + subsystem)
        linkedFolders = [k for k, v in folder_component.items() if v == subsystem]
        # The first linked folder's URL is used as the clone URL.
        repo_url = linkedFolders[0] + ".git"
        repo_dir = "/tmp/" + subsystem + ".git"
        commits = getAllGitCommits(repo_url, repo_dir)
        for commit in commits:
            for commit_row in commit_rows:
                if commit_row[1] == commit["data"]["commit"]:
                    commitID = commit_row[0]
                    # Author is 'Name <email>'; only the email is needed here.
                    nameAndEmail = commit["data"]["Author"]
                    email = nameAndEmail[nameAndEmail.find('<') + 1:nameAndEmail.find('>')]
                    developerID = email
                    for item in unique_developers:
                        if item['Linked_Developer_Email'] == email:
                            developerID = item['Email']
                    # If involvement rows already exist for this commit, update
                    # their UserID; otherwise insert a fresh involvement entry.
                    cursor.execute(
                        "SELECT * FROM involvement where ProjectId = %s AND CommitID = %s",
                        (projectName, str(commitID)),
                    )
                    involvement_rows = cursor.fetchall()
                    if len(involvement_rows):
                        for inv_row in involvement_rows:
                            cursor.execute(
                                "UPDATE involvement SET UserID = %s WHERE ID = %s",
                                (developerID, inv_row[0]),
                            )
                        continue
                    else:
                        cursor.execute(
                            "INSERT INTO involvement (ID, ProjectID, UserID, Type, SourceURL, CommitID, Time) "
                            "VALUES (default,%s,%s,'10',%s,%s,from_unixtime(%s))",
                            (projectName, developerID, commit["data"]["commit"],
                             str(commitID), str(commit["updated_on"])),
                        )
    cursor.close()
    connection.commit()
    connection.close()  # fixed: was `connection.close` — attribute access, never called
def fillInvolvemenTable_Dev_IssueInvolvement(repo_owner, projectName, unique_developers):
    """Record type-5 (issue) involvement for issue reporters and commenters.

    Reads folder-component.csv to find this project's subsystems, fetches each
    subsystem's issues/pull requests from GitHub, and inserts one involvement
    row for the reporter plus one per comment.

    Args:
        repo_owner: GitHub owner/organization used to build 'owner/repo#n' ids.
        projectName: project identifier.
        unique_developers: de-duplicated developer records; GitHub logins that
            match a 'Linked_Developer_Email' are replaced by the canonical 'Email'.
    """
    connection = getDBConn()
    cursor = connection.cursor()
    folder_component = dict()
    totalSubsystem = []
    with open('folder-component.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            if row['project'] == projectName:
                print(row['subsystem'], row['folder'])
                folder_component[row['folder']] = row['subsystem']
                totalSubsystem.append(row['subsystem'])
    # MySQL STR_TO_DATE format, bound as a parameter so its literal % signs do
    # not collide with the driver's %s placeholder handling.
    date_format = '%Y-%m-%dT%H:%i:%s'
    for subsystem in totalSubsystem:
        print("subsystem" + subsystem)
        issuesAndPullRequests = getAllGitIssues_pullRequests(repo_owner, subsystem)
        issue_prefix = repo_owner + '/' + subsystem + '#'
        for pull_request_issues in issuesAndPullRequests.fetch():
            data = pull_request_issues['data']
            issue_ = issue_prefix + str(data['number'])
            print(data['number'])
            # Git reporter = issue author; remap to the canonical developer id
            # when the login appears in the de-duplicated developer list.
            developer_reporter = data['user_data']['login']
            creation_date = data['created_at'][:data['created_at'].find("Z")]
            for item in unique_developers:
                if item['Linked_Developer_Email'] == developer_reporter:
                    developer_reporter = item['Email']
            cursor.execute(
                "INSERT INTO involvement (ID, ProjectID, UserID, Type, SourceURL, IssueID, Time) "
                "VALUES (default,%s,%s,'5',%s,%s,STR_TO_DATE(%s,%s))",
                (projectName, developer_reporter, issue_, issue_, creation_date, date_format),
            )
            for comment in data['comments_data']:
                developer_pullrequest_commenter = comment['user_data']['login']
                for item in unique_developers:
                    if item['Linked_Developer_Email'] == developer_pullrequest_commenter:
                        developer_pullrequest_commenter = item['Email']
                comment_date = comment['created_at'][:comment['created_at'].find("Z")]
                # HACK kept from the original: any 'YYYY-03-25T...' timestamp has
                # its time pinned to 14:54:51 — TODO confirm why this was needed.
                if comment_date.startswith('-03-25T', 4):
                    comment_date = comment_date[0:11] + "14:54:51"
                print(issue_)
                cursor.execute(
                    "INSERT INTO involvement (ID, ProjectID, UserID, Type,SourceURL, IssueID, Time) "
                    "VALUES (default,%s,%s,'5',%s,%s,STR_TO_DATE(%s,%s))",
                    (projectName, developer_pullrequest_commenter, issue_, issue_, comment_date, date_format),
                )
    cursor.close()
    connection.commit()
    connection.close()  # fixed: was `connection.close` — attribute access, never called
def fillCommitTable(commits, projectName, listOfNamesAndEmail):
    """Insert commits into the commit table and collect author identities.

    Appends a {'name','email'} dict for each commit author to
    *listOfNamesAndEmail* (mutated in place for later developer de-duplication),
    inserts the commit row, then stores the raw commit JSON on it.
    """
    connection = getDBConn()
    cursor = connection.cursor()
    for commit in commits:
        # Author is 'Name <email>'; the -1 drops the space before '<'.
        nameAndEmail = commit["data"]["Author"]
        name = nameAndEmail[:nameAndEmail.index("<") - 1]
        email = nameAndEmail[nameAndEmail.find('<') + 1:nameAndEmail.find('>')]
        listOfNamesAndEmail.append({'name': name, 'email': email})
        cursor.execute(
            "INSERT IGNORE INTO commit (CommitID,CommitURL, Date, ProjectID) "
            "VALUES (default,%s,from_unixtime(%s),%s) ON DUPLICATE KEY UPDATE CommitID=CommitID",
            (commit["data"]["commit"], str(commit["updated_on"]), projectName),
        )
        # Second statement was already parameterized in the original; the insert
        # above is now consistent with it.
        cursor.execute(
            "UPDATE commit SET CommitJSONContent = %s WHERE CommitURL = %s",
            (str(json.dumps(commit)), commit["data"]["commit"]),
        )
    cursor.close()
    connection.commit()
    connection.close()  # fixed: was `connection.close` — attribute access, never called
def fillSubsystemTable(projectName, repo_owner, listOfNamesAndEmail):
    """Register each subsystem of the project and drive all table-fill steps.

    Subsystem membership comes from folder-component.csv (folder -> subsystem
    mapping per project). For each subsystem: insert the subsystem row, clone
    its repo (first linked folder's URL), fetch issues/PRs, then call the
    commit/artifact/issue fill functions in order.
    """
    folder_component = dict()
    totalSubsystem = []
    with open('folder-component.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            if row['project'] == projectName:
                print(row['subsystem'], row['folder'])
                folder_component[row['folder']] = row['subsystem']
                totalSubsystem.append(row['subsystem'])
    for subsystem in totalSubsystem:
        connection = getDBConn()
        print("subsystem " + subsystem)
        linkedFolders = [k for k, v in folder_component.items() if v == subsystem]
        my_json_string = json.dumps(linkedFolders)
        # The first linked folder's URL is used as the clone URL.
        repo_url = linkedFolders[0] + ".git"
        cursor = connection.cursor()
        cursor.execute(
            "INSERT IGNORE INTO subsystem (SubsystemID,ProjectID,Name,SubsystemURLs) "
            "VALUES (%s,%s,%s,%s) ON DUPLICATE KEY UPDATE SubsystemID=SubsystemID",
            (subsystem, projectName, subsystem, my_json_string),
        )
        cursor.close()
        connection.commit()
        connection.close()  # fixed: was `connection.close` — attribute access, never called
        repo_dir = "/tmp/" + subsystem + ".git"
        commits = getAllGitCommits(repo_url, repo_dir)
        issuesAndPullRequests = getAllGitIssues_pullRequests(repo_owner, subsystem)
        print("i fetched everything fine")
        fillCommitTable(commits, projectName, listOfNamesAndEmail)
        print("fillCommitTable fine")
        fillArtifactTable(commits, projectName, subsystem)
        print("fillArtifactTable fine")
        commit_rows = fillArtifacts_CommitsTable(commits, subsystem, projectName)
        print(len(commit_rows))
        print("fillArtifacts_CommitsTable fine")
        fillIssueTable_GitIssues_And_PullRequests(issuesAndPullRequests, projectName, repo_owner, subsystem, listOfNamesAndEmail)
        print("fillIssueTable_GitIssues_And_PullRequests fine")
        fillCommits_IssuesTable_Through_Issue(issuesAndPullRequests, repo_owner, projectName, subsystem, commit_rows)
        print("fillCommits_IssuesTable_Through_Issue fine")
        fillCommit_IssuesTable_Through_Commit(commits, projectName, subsystem, commit_rows)
        print("fillCommit_IssuesTable_Through_Commit fine")
def fillArtifactTable(commits, projectName, subsystem):
    """Insert one artifact row per file touched by any of the given commits."""
    connection = getDBConn()
    cursor = connection.cursor()
    for commit in commits:
        for file in commit["data"]["files"]:
            fileURL = file['file']
            # Artifact name = last path component (or the whole path for bare names).
            if fileURL.rfind("/") != -1:
                fileName = fileURL[fileURL.rfind("/") + 1:]
            else:
                fileName = fileURL
            # NOTE(review): quote/comma stripping predates parameterized queries,
            # but other tables match on the sanitized URL (e.g. the artifact
            # lookup in fillArtifacts_CommitsTable), so it is kept so stored
            # values stay unchanged.
            fileName = fileName.replace("'", '').replace(",", '')
            fileURL = fileURL.replace("'", '').replace(",", '')
            cursor.execute(
                "INSERT IGNORE INTO artifact (ArtifactID,ArtifactURL,SubsystemID,Name,ProjectID) "
                "VALUES (default,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE ArtifactID=ArtifactID",
                (fileURL, subsystem, fileName, projectName),
            )
    cursor.close()
    connection.commit()
    connection.close()  # fixed: was `connection.close` — attribute access, never called
def fillDeveloperTable(listOfNamesAndEmail, projectName):
    """De-duplicate developers and store one row per unique developer.

    Runs the external de-duplication pipeline (transformation / deDuplication /
    generateLinkedDevelopers), then inserts each developer using the linked
    (canonical) identity when one exists, or the raw name/email otherwise.

    Returns:
        The linked-developer rows read back from developer_LinkedDevelopers.csv
        for this project.
    """
    connection = getDBConn()
    cursor = connection.cursor()
    # Remove duplicate {name, email} dicts.
    listOfNamesAndEmail = [dict(t) for t in {tuple(d.items()) for d in listOfNamesAndEmail}]
    # Transform developers data to feed the de-duplication algorithm.
    transformation(listOfNamesAndEmail)
    duplicatePairs = deDuplication()
    # Writes developer_LinkedDevelopers.csv, consumed just below.
    generateLinkedDevelopers(duplicatePairs, listOfNamesAndEmail, projectName)
    rows = []
    with open('developer_LinkedDevelopers.csv', encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            if row['Project'] == projectName:
                rows.append({
                    'DisplayName': row['DisplayName'],
                    'Email': row['Email'],
                    'Linked_Developer_DisplayName': row['Linked_Developer_DisplayName'],
                    'Linked_Developer_Email': row['Linked_Developer_Email'],
                })
    for developer in listOfNamesAndEmail:
        found = False
        for item in rows:
            if item['Linked_Developer_Email'] == developer['email']:
                cursor.execute(
                    "INSERT IGNORE INTO developer (UserID,Email,DisplayName,ProjectID) "
                    "VALUES (%s,%s,%s,%s) ON DUPLICATE KEY UPDATE UserID=UserID",
                    (item['Email'], item['Email'], item['DisplayName'], projectName),
                )
                found = True
        if not found:
            # Quote stripping kept from the original so stored display names
            # stay consistent with earlier runs.
            developer['name'] = developer['name'].replace("'", ' ')
            cursor.execute(
                "INSERT IGNORE INTO developer (UserID,Email,DisplayName,ProjectID) "
                "VALUES (%s,%s,%s,%s) ON DUPLICATE KEY UPDATE UserID=UserID",
                (developer['email'], developer['email'], developer['name'], projectName),
            )
    cursor.close()
    connection.commit()
    connection.close()  # fixed: was `connection.close` — attribute access, never called
    return rows
def fillCommits_IssuesTable_Through_Issue(issuesAndPullRequests, repo_owner, projectName, subsystem, commit_rows):
    """Link pull requests to the commits GitHub reports as attached to them.

    Pull requests carry their commit hashes in 'commits_data'; each hash found
    in *commit_rows* yields a Commits_Issues pair and a type-10 Involvement row.
    """
    connection = getDBConn()
    cursor = connection.cursor()
    for pull in issuesAndPullRequests.fetch(category='pull_request'):
        data = pull['data']
        print(data['number'])
        issue_ = repo_owner + '/' + subsystem + '#' + str(data['number'])
        for linkedCommit in data['commits_data']:
            for commit_row in commit_rows:
                if commit_row[1] == linkedCommit:
                    print(linkedCommit)
                    print(commit_row[0])
                    cursor.execute(
                        "INSERT INTO Commits_Issues (CommitID,IssueID) VALUES (%s,%s) "
                        "ON DUPLICATE KEY UPDATE CommitID = CommitID, IssueID = IssueID",
                        (str(commit_row[0]), issue_),
                    )
                    cursor.execute(
                        "INSERT INTO involvement (ID, ProjectID,Type,SourceURL,CommitID,IssueID,Time) "
                        "VALUES (default,%s,'10',%s,%s,%s,%s)",
                        (projectName, issue_, str(commit_row[0]), issue_, str(commit_row[2])),
                    )
    cursor.close()
    connection.commit()
    connection.close()  # fixed: was `connection.close` — attribute access, never called
from communityGraphExtraction import convertIssueInvolvementToEdges
from communityGraphExtraction import getArtChangeDatesPerDev
from communityGraphExtraction import getMaxIssueInvolvementsPerUser
from communityGraphExtraction import getSubsystems
from communityGraphExtraction import getCrossSubsystemIssueLinks
from communityGraphExtraction import convertCrossSubsystemIssueLinksToEdges
from communityGraphExtraction import getCrossSubsystemDeveloperInvolvement
from communityGraphExtraction import convertSubsystemDeveloperInvolvementToEdges
from cypher_neo4j import graph2Cypher
from string import Template
import matplotlib.pyplot as plt
import sys
from sqlConnection import getDBConn

# NOTE(review): module-level DB connection opened as an import side effect.
connection = getDBConn()

def generateDictForProjectWindow(projectId):
    """Build a graph dict for one project: developer edges derived from commit
    overlap and from issue involvement, plus the project's developer list."""
    graph = dict()
    overlapWindowInSeconds = 3600*24*30*4 #4months
    changes = getArtChangeDatesPerDev(projectId)
    # NOTE(review): comparePairwiseDevArtifactChangeOverlap and
    # getDevelopersPerProject are not among the visible imports — presumably
    # defined or imported elsewhere in this file; verify before running.
    graph['developerEdgesViaCommits'] = comparePairwiseDevArtifactChangeOverlap(changes, overlapWindowInSeconds)
    involvements = getMaxIssueInvolvementsPerUser(projectId)
    graph['developerEdgesViaIssues'] = convertIssueInvolvementToEdges(involvements)
    graph['developers'] = getDevelopersPerProject(projectId)
    return graph

def generateDictForSubystems(projectID):
    """Start building a per-subsystem graph dict for *projectID*.

    NOTE(review): this function appears truncated in the source — it fetches
    subsystems but never uses them or returns the graph; confirm against the
    original file.
    """
    graph = dict()
    graph['projectID'] = projectID
    (subsys, subsysParent) = getSubsystems(projectID)
def fillIssueTable_GitIssues_And_PullRequests(issuesAndPullRequests, projectName, repo_owner, subsystem, listOfNamesAndEmail):
    """Insert GitHub issues and pull requests into the issue table.

    Pull requests and plain issues follow identical logic but are stored in
    different URL/name columns. Participant identities (reporter and
    commenters) are appended to *listOfNamesAndEmail* (mutated in place) for
    later developer de-duplication. Each issue also gets its raw JSON stored
    in GitJSONContent.
    """
    connection = getDBConn()
    cursor = connection.cursor()
    issue_prefix = repo_owner + '/' + subsystem + '#'
    # MySQL STR_TO_DATE format, bound as a parameter so its literal % signs do
    # not collide with the driver's %s placeholder handling.
    date_format = '%Y-%m-%dT%H:%i:%s'
    for item in issuesAndPullRequests.fetch():
        data = item['data']
        print(data['number'])
        issue_id = issue_prefix + str(data['number'])
        creation_date, close_date = _issue_dates(data)
        # Only the column pair differs between the PR and plain-issue branches;
        # the column names are fixed literals, never user data.
        if 'pull_request' in data:
            url_col, name_col = 'GitPullRequestURL', 'GitPullRequestName'
        else:
            url_col, name_col = 'GitIssueURL', 'GitIssueName'
        cursor.execute(
            "INSERT IGNORE INTO issue (IssueID,CreationDate,CloseDate," + url_col + "," + name_col + ", ProjectID) "
            "VALUES (%s,STR_TO_DATE(%s,%s),STR_TO_DATE(%s,%s),%s,%s,%s) "
            "ON DUPLICATE KEY UPDATE IssueID=IssueID ",
            (issue_id, creation_date, date_format, close_date, date_format,
             data['url'], str(data['number']), projectName),
        )
        cursor.execute(
            "INSERT IGNORE INTO subsystems_Issues (SubsystemID,IssueID) VALUES (%s,%s) "
            "ON DUPLICATE KEY UPDATE SubsystemID=SubsystemID, IssueID=IssueID",
            (subsystem, issue_id),
        )
        _collect_participants(data, listOfNamesAndEmail)
        # NOTE(review): in the flattened original the JSON update references
        # `item`, so it is per-issue; assumed to run for both branches — confirm.
        cursor.execute(
            "UPDATE issue SET GitJSONContent = %s WHERE IssueID = %s",
            (str(json.dumps(item)), issue_id),
        )
    cursor.close()
    connection.commit()
    connection.close()  # fixed: was `connection.close` — attribute access, never called


def _issue_dates(data):
    """Return (creation_date, close_date) ISO strings with the trailing 'Z'
    removed; close_date is '' for open issues (STR_TO_DATE('') stores NULL)."""
    creation_date = data['created_at'][:data['created_at'].find("Z")]
    if data['closed_at'] is not None:
        close_date = data['closed_at'][:data['closed_at'].find("Z")]
    else:
        close_date = ""
    return creation_date, close_date


def _collect_participants(data, listOfNamesAndEmail):
    """Append reporter and commenter identities to *listOfNamesAndEmail*.

    Mirrors the original behavior exactly: the reporter is always appended
    once, and appended a second time when the issue has comments (the later
    de-duplication step removes duplicates).
    """
    def _append(user_data):
        userID = user_data['login']
        displayName = user_data['name']
        if not displayName:
            displayName = userID
        listOfNamesAndEmail.append({'name': displayName, 'email': userID})

    _append(data['user_data'])
    issue_comments = data['comments_data']
    if len(issue_comments) != 0:
        _append(data['user_data'])
        for issue_comment in issue_comments:
            _append(issue_comment['user_data'])