from os import listdir
from os.path import isfile, join
from statistics import median

# Project-local helpers used below (defined elsewhere in this repo):
# JSONHandler, CSVHandler, TextProcessing, DateUtils, PrototypeAPI.


def _get_users_labels_in_issues_and_pulls(self):
    """
    Collects the author_association of issues/pull requests (opened, closed or merged)

    :return: dict mapping each issue/pull request number to its author_association
    :rtype: dict
    """
    path = self.path + '/' + self.project
    json = JSONHandler(path + '/')
    issues = json.open_json(self.project + '_issues.json')
    pulls = json.open_json(self.project + '_pulls.json')

    author_association = {}
    for issue in issues:
        if issue['author_association']:
            author_association[issue['issue_number']] = issue['author_association']

    for pull in pulls:
        if pull['author_association']:
            author_association[pull['pull_request_number']] = pull['author_association']

    return author_association

def opened_employee_or_temporary(self):
    """
    Collects the status of the user that opened each issue/pull request.
    A user is an employee if their author_association is OWNER, MEMBER,
    COLLABORATOR or CONTRIBUTOR; otherwise the user is temporary.

    :return: list of the status of the user that opened the issue/pull request,
        by issue/pull request
    :rtype: list
    """
    print("#### Opened by Employee or Temporary ####")
    path = self.path + '/' + self.project
    json = JSONHandler(path + '/')
    issues = json.open_json(self.project + '_issues.json')
    pulls = json.open_json(self.project + '_pulls.json')

    opened_by = [['number', 'status', 'user']]
    for issue in issues:
        if 'author_association' in issue:
            opened_by.append([issue['issue_number'],
                              self._employee_or_temporary(issue['author_association']),
                              issue['user']])

    for pull in pulls:
        if 'author_association' in pull:
            opened_by.append([pull['pull_request_number'],
                              self._employee_or_temporary(pull['author_association']),
                              pull['user']])

    csv = CSVHandler()
    csv.write_csv(self.path + '/' + self.project + '/metrics/',
                  self.project + '_opened_by.csv', opened_by)
    return opened_by

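# A minimal sketch of the _employee_or_temporary helper referenced above. Its
# body is not part of this section, so the returned labels ('employee' /
# 'temporary') are an assumption inferred from the docstring:
def _employee_or_temporary(self, author_association):
    """Map a GitHub author_association onto an employee/temporary status."""
    employee_labels = {'OWNER', 'MEMBER', 'COLLABORATOR', 'CONTRIBUTOR'}
    return 'employee' if author_association in employee_labels else 'temporary'
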
def collect_commits_on_pulls(self, owner: str, project: str):
    """
    Collects the commits of each pull request from the GitHub API

    :param owner: repository owner
    :type owner: str
    :param project: project name
    :type project: str
    :return: list of commit hashes from pull requests
    :rtype: list
    """
    print('Collecting Pull Requests Commits')
    pulls = []
    mypath = self.config['output_path'] + project + '/pulls/all/'
    json = JSONHandler(mypath)
    commits_json = JSONHandler(self.config['output_path'] + project + '/pulls_commits/commits/')
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

    for file in onlyfiles:
        batch = json.open_json(file)
        for pull in batch:
            pulls.append(pull['number'])

    hashes = []
    for pull in pulls:
        # Reuse commits already collected for this pull request; hashes are
        # only gathered from batches that have been saved to disk.
        if JSONHandler.file_exists(self.config['output_path'] + project +
                                   '/pulls_commits/commits/' + str(pull) + '.json'):
            commits_pull = commits_json.open_json(str(pull) + '.json')
            for commit_pull in commits_pull:
                for commit in commit_pull:
                    hashes.append(commit['sha'])
            continue

        pulls_endpoint = PrototypeAPI(owner, project, '/pulls_commits/',
                                      '/pulls/' + str(pull) + '/commits')
        files = pulls_endpoint.collect_batch(False)
        commits_json.save_json(files, str(pull))

    commits_endpoint = PrototypeAPI(owner, project, '/pulls_commits/', '/commits')
    aux = 1
    for sha in hashes:
        if not sha:
            continue
        commits_endpoint.collect_single(sha)
        print(str(aux * 100 / len(hashes)) + "%")
        aux += 1

    return hashes

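# The resume logic above skips pulls whose commits were already saved. A tiny
# standalone sketch of that idempotent-collection pattern; the paths and the
# fetch callable are hypothetical, not this repo's API:
import os

def collect_if_missing(cache_dir, pull_number, fetch):
    """Fetch a pull's commits only if no cached JSON exists for it."""
    cached = os.path.join(cache_dir, str(pull_number) + '.json')
    if os.path.isfile(cached):
        return cached              # reuse the cached batch, no network call
    data = fetch(pull_number)      # e.g. a PrototypeAPI.collect_batch call
    with open(cached, 'w') as fh:
        fh.write(data)
    return cached
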
def _get_comments_in_discussion(self):
    """
    Collects the number of comments of each issue/pull request and the number
    of words across those comments

    :return: two dicts, one with the number of comments per issue/pull request
        and another with the number of words per issue/pull request
    :rtype: dict, dict
    """
    mypath = self.path + self.project + '/comments/individual/'
    json = JSONHandler(mypath)
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

    words_in_discussion = {}
    comments_in_discussion = {}
    for file in onlyfiles:
        comments = json.open_json(file)
        for comment in comments:
            if 'issue_url' in comment:
                issue = comment['issue_url'].split('/')[-1]
                if issue not in words_in_discussion:
                    words_in_discussion[issue] = 0
                if issue not in comments_in_discussion:
                    comments_in_discussion[issue] = 0

                tp = TextProcessing()
                processed = tp.pre_process_text(comment['body'])
                # Each pre-processed token counts as one word; joining the
                # tokens and re-splitting, as before, added a spurious empty
                # token per comment.
                words_in_discussion[issue] += len(processed)
                comments_in_discussion[issue] += 1

    return comments_in_discussion, words_in_discussion

def _get_users_labels_in_comments(self):
    """
    Collects the author_association of each comment on issues/pull requests

    :return: dict mapping each issue/pull request number to the list of
        author_associations of its comments
    :rtype: dict
    """
    mypath = self.path + self.project + '/comments/individual/'
    json = JSONHandler(mypath)
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

    users = {}
    # First-seen (issue + user) keys; kept from the original code, but note
    # that it is never read back (renamed from `hash`, which shadowed the
    # builtin).
    seen = {}
    for file in onlyfiles:
        comments = json.open_json(file)
        for comment in comments:
            issue = comment['issue_url'].split('/')[-1]
            if issue not in users:
                users[issue] = []
            if issue + comment['user']['login'] not in seen:
                seen[issue + comment['user']['login']] = 0
            users[issue].append(comment['author_association'])

    return users

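# The unused `seen` map above suggests an intended one-label-per-user-per-issue
# dedup that was never wired in. A minimal sketch of that variant, assuming the
# same comment structure; this is an assumption, not the original behavior:
def _get_users_labels_in_comments_deduped(comments):
    users = {}
    seen = set()
    for comment in comments:
        issue = comment['issue_url'].split('/')[-1]
        key = (issue, comment['user']['login'])
        if key in seen:
            continue  # count each user's association once per discussion
        seen.add(key)
        users.setdefault(issue, []).append(comment['author_association'])
    return users
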
def get_comments_in_discussion(self):
    # Public alias; the body previously duplicated
    # _get_comments_in_discussion verbatim.
    return self._get_comments_in_discussion()

def mean_time_between_replies(self):
    """
    Collects the mean time between comments inside an issue or pull request

    :return: list of the mean time between comments per issue/pull request
    :rtype: list
    """
    print('#### Mean Time Between Comments ####')
    mypath = self.path + self.project + '/comments/individual/'
    json = JSONHandler(mypath)
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

    comments_per_issue = {}
    for file in onlyfiles:
        comments = json.open_json(file)
        for comment in comments:
            issue = comment['issue_url'].split('/')[-1]
            if issue not in comments_per_issue:
                comments_per_issue[issue] = []
            comments_per_issue[issue].append(comment['created_at'])

    date_utils = DateUtils()
    mean_time = [['issue', 'mean_time']]
    for key in comments_per_issue:
        days_between = []
        sorted_dates = date_utils.sort_dates(comments_per_issue[key])
        previous = None
        for date in sorted_dates:
            if previous is not None:
                days_between.append(date_utils.get_days_between_dates(previous, date))
            previous = date

        # Guard against discussions with fewer than two comments; the original
        # padded the denominator by one, which skewed every mean downwards.
        mean_days = sum(days_between) / len(days_between) if days_between else 0
        mean_time.append([key, mean_days])

    csv = CSVHandler()
    csv.write_csv(self.path + '/' + self.project + '/metrics/',
                  self.project + '_mean_time_between_replies.csv', mean_time)
    return mean_time

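# A self-contained illustration of the mean-gap computation above, using only
# the standard library in place of the project's DateUtils helper; the
# timestamps are made-up examples:
from datetime import datetime

timestamps = ['2020-01-01T10:00:00Z', '2020-01-04T10:00:00Z', '2020-01-10T10:00:00Z']
dates = sorted(datetime.strptime(t, '%Y-%m-%dT%H:%M:%SZ') for t in timestamps)
gaps = [(b - a).days for a, b in zip(dates, dates[1:])]   # [3, 6]
mean_gap = sum(gaps) / len(gaps) if gaps else 0           # 4.5
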
def get_number_of_patches(self):
    """
    Collects the number of code snippets inside the comments of each
    issue/pull request.

    :return: list of the number of snippets per issue/pull request
    :rtype: list
    """
    print('#### Number of Snippets ####')
    mypath = self.path + self.project + '/comments/individual/'
    json = JSONHandler(mypath)
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

    patches_in_discussion = {}
    for file in onlyfiles:
        comments = json.open_json(file)
        for comment in comments:
            if 'issue_url' in comment:
                issue = comment['issue_url'].split('/')[-1]
                if issue not in patches_in_discussion:
                    patches_in_discussion[issue] = 0
                if '```' in comment['body']:
                    # Splitting on ``` alternates prose and fenced code; the
                    # even-positioned chunks (2nd, 4th, ...) are the snippets.
                    patches = comment['body'].split('```')
                    count = 0
                    if issue not in self.patches_size:
                        self.patches_size[issue] = 0
                    for position, patch in enumerate(patches, start=1):
                        if position % 2 != 0:
                            continue
                        self.patches_size[issue] += len(patch)
                        count += 1
                    patches_in_discussion[issue] += count

    number_of_patches_in_discussion = [['issue', 'number_patches']]
    for key in patches_in_discussion:
        number_of_patches_in_discussion.append([key, patches_in_discussion[key]])

    csv = CSVHandler()
    csv.write_csv(self.path + '/' + self.project + '/metrics/',
                  self.project + '_patches_in_discussion.csv',
                  number_of_patches_in_discussion)
    return number_of_patches_in_discussion

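# A quick standalone check of the fence-splitting trick used above: splitting
# on ``` alternates prose and code, so the 0-based odd chunks are the snippets.
# The body string is a made-up example:
body = "See this:\n```\nprint('hi')\n```\nand also\n```\nx = 1\n```\ndone"
snippets = body.split('```')[1::2]
print(len(snippets))                  # 2 snippets
print(sum(len(s) for s in snippets))  # total characters inside the fences
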
def insert_commits_from_pulls(self):
    database = self.database['commits']
    pulls_commits_path = self.path + 'pulls_commits/commits/'
    json = JSONHandler(pulls_commits_path)
    commits = [f for f in listdir(pulls_commits_path) if isfile(join(pulls_commits_path, f))]

    # Map each commit sha to the pull requests (file names) it came from.
    commit_pulls = {}
    for file in commits:
        commit_batch = json.open_json(file)
        for commit_list in commit_batch:
            for commit in commit_list:
                if commit['sha'] not in commit_pulls:
                    commit_pulls[commit['sha']] = []
                commit_pulls[commit['sha']].append(file.split('.')[0])

    pulls_commits_path = self.path + 'pulls_commits/individual/'
    json = JSONHandler(pulls_commits_path)
    commits = [f for f in listdir(pulls_commits_path) if isfile(join(pulls_commits_path, f))]
    for file in commits:
        commit = json.open_json(file)
        if database.find_one({'sha': commit['sha']}):
            # Commit already stored: just flag it and record its pull requests.
            database.update_one({'sha': commit['sha']},
                                {"$set": {'from_pull': True}})
            if commit['sha'] in commit_pulls:
                database.update_one({'sha': commit['sha']},
                                    {"$set": {'pull_origin': commit_pulls[commit['sha']]}})
            continue

        commit['from_pull'] = True
        commit['pull_origin'] = commit_pulls.get(commit['sha'], [])
        database.insert_one(commit)

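# The find_one/update_one/insert_one pattern above can race under concurrent
# writers. A minimal sketch of the same step as a single PyMongo upsert,
# assuming the same 'commits' collection layout; `commit` is the raw JSON
# document as loaded above, and this helper is hypothetical, not part of the repo:
def _upsert_pull_commit(database, commit, pull_origin):
    database.update_one(
        {'sha': commit['sha']},
        {'$set': {'from_pull': True, 'pull_origin': pull_origin},
         '$setOnInsert': commit},  # full document stored only on first insert
        upsert=True)
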
def get_time_in_days_between_open_and_close(self):
    """
    Collects the time in days between the day an issue or pull request was
    opened and the day it was closed (or merged, for pull requests).

    :return: list of the time in days per issue/pull request
    :rtype: list
    """
    print('#### Discussion Length ####')
    path = self.path + '/' + self.project
    json = JSONHandler(path + '/')
    issues = json.open_json(self.project + '_issues.json')
    pulls = json.open_json(self.project + '_pulls.json')

    days_between = [['number', 'days']]
    date_utils = DateUtils()
    for issue in issues:
        days = 0
        if 'closed' in issue['state']:
            days = date_utils.get_days_between_dates(issue['created_at'],
                                                     issue['closed_at'])
        days_between.append([issue['issue_number'], days])

    for pull in pulls:
        days = 0
        if 'closed' in pull['state']:
            # Merged pull requests close at merge time.
            if pull['merged_at']:
                days = date_utils.get_days_between_dates(pull['created_at'],
                                                         pull['merged_at'])
            else:
                days = date_utils.get_days_between_dates(pull['created_at'],
                                                         pull['closed_at'])
        days_between.append([pull['pull_request_number'], days])

    csv = CSVHandler()
    csv.write_csv(self.path + '/' + self.project + '/metrics/',
                  self.project + '_discussion_length.csv', days_between)
    return days_between

def insert_comments(self):
    database = self.database['comments']
    # Comments live in two places on disk; deduplicate by GitHub comment id.
    for comments_path in (self.path + 'comments/individual/',
                          self.path + 'comments/issues/all/'):
        json = JSONHandler(comments_path)
        comments = [f for f in listdir(comments_path) if isfile(join(comments_path, f))]
        for file in comments:
            comment_batch = json.open_json(file)
            for comment in comment_batch:
                if database.find_one({'id': comment['id']}):
                    continue
                issue_number = comment['issue_url'].split('issues/')[1]
                comment['issue_number'] = int(issue_number)
                database.insert_one(comment)

def insert_pulls(self):
    database = self.database['pull_requests']
    pulls_path = self.path + 'pulls/individual/'
    json = JSONHandler(pulls_path)
    pulls = [f for f in listdir(pulls_path) if isfile(join(pulls_path, f))]
    for file in pulls:
        pull = json.open_json(file)
        if database.find_one({'number': pull['number']}):
            continue
        database.insert_one(pull)

def insert_issues(self):
    database = self.database['issues']
    issues_path = self.path + 'issues/individual/'
    json = JSONHandler(issues_path)
    issues = [f for f in listdir(issues_path) if isfile(join(issues_path, f))]
    for file in issues:
        issue = json.open_json(file)
        if database.find_one({'number': issue['number']}):
            continue
        database.insert_one(issue)

def get_median_of_number_of_comments(self):
    """
    Collects the number of comments inside each issue or pull request, along
    with a running median of those counts

    :return: list with the number of comments per issue/pull request
    :rtype: list
    """
    print("#### Median Comments ####")
    mypath = self.path + self.project + '/comments/individual/'
    json = JSONHandler(mypath)
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

    comments_per_issue = {}
    for file in onlyfiles:
        comments = json.open_json(file)
        for comment in comments:
            if 'issue_url' in comment:
                issue = int(comment['issue_url'].split('/')[-1])
                if issue not in comments_per_issue:
                    comments_per_issue[issue] = 0
                comments_per_issue[issue] += 1

    values = []
    median_comments = [['issue', 'median_comments']]
    number_comments = [['id', 'number_comments']]
    for key in sorted(comments_per_issue):
        # Running median over issues in ascending id order.
        values.append(comments_per_issue[key])
        median_comments.append([key, median(values)])
        number_comments.append([key, comments_per_issue[key]])

    csv = CSVHandler()
    csv.write_csv(self.path + '/' + self.project + '/metrics/',
                  self.project + '_median_comments.csv', median_comments)
    csv.write_csv(self.path + '/' + self.project + '/metrics/',
                  self.project + '_number_comments.csv', number_comments)
    return number_comments

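# A standalone illustration of the running median written to
# *_median_comments.csv above, using statistics.median from the stdlib; the
# counts dict is a made-up example:
from statistics import median

counts = {101: 3, 102: 7, 103: 5}   # hypothetical issue id -> comment count
values = []
for issue in sorted(counts):
    values.append(counts[issue])
    print(issue, median(values))    # 101 3, then 102 5.0, then 103 5
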
def insert_commits(self):
    database = self.database['commits']
    commits_path = self.path + 'commits/individual/'
    json = JSONHandler(commits_path)
    commits = [f for f in listdir(commits_path) if isfile(join(commits_path, f))]
    for file in commits:
        commit = json.open_json(file)
        if database.find_one({'sha': commit['sha']}):
            continue
        commit['from_pull'] = False
        commit['pull_origin'] = []
        database.insert_one(commit)

def insert_events(self):
    database = self.database['events']
    events_path = self.path + 'events/all/'
    json = JSONHandler(events_path)
    events = [f for f in listdir(events_path) if isfile(join(events_path, f))]
    for file in events:
        event_batch = json.open_json(file)
        for event in event_batch:
            if database.find_one({'id': event['id']}):
                continue
            database.insert_one(event)

def __init__(self):
    # Note: this config path is machine-specific; adjust it to your checkout.
    json_handler = JSONHandler('C:/Users/gurio/PycharmProjects/GHPyFramework/')
    self.config = json_handler.open_json('config.json')
    self.projects = self.config['projects']

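# A sketch of the config.json shape assumed throughout this file. Only the
# 'projects' and 'output_path' keys actually appear in the code above; the
# structure of each projects entry is a guess:
#
# {
#     "output_path": "C:/Users/gurio/PycharmProjects/GHPyFramework/output/",
#     "projects": [
#         {"owner": "apache", "project": "kafka"}
#     ]
# }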