def get_issue_events_dev(org_path, project_name, developer_login):
    """Write one developer's issue events, taken from the repo-wide events file.

    Reads ``<org_path>/<project_name>/Other_Activities/complete_issues_events_repo.csv``,
    keeps the rows whose ``creator_login`` equals ``developer_login`` and, if
    at least one row matched, writes them to
    ``<org_path>/<project_name>/Activities_Plots/<developer_login>/issues_events.csv``.
    The developer's output directory is created in every case.

    Args:
        org_path: Directory that contains the project folder.
        project_name: Name of the project folder under ``org_path``.
        developer_login: Login compared against the ``creator_login`` column.

    Returns:
        None. Progress is reported with ``print``.
    """
    project_dir = org_path + '/' + project_name
    path = project_dir + '/Activities_Plots/' + developer_login
    os.makedirs(path, exist_ok=True)

    repo_events_file = (project_dir
                        + '/Other_Activities/complete_issues_events_repo.csv')
    # isfile() rather than `name in os.listdir(dir)`: listing the directory
    # raised FileNotFoundError when 'Other_Activities' had never been created.
    # That situation is now reported like any other "no events file" case.
    if not os.path.isfile(repo_events_file):
        print('{}: No Issue Events'.format(project_name))
        return

    ### Get Other Issues Events
    issues_events = pandas.read_csv(repo_events_file, sep=',')
    issues_events_data = pandas.DataFrame(
        columns=['id', 'date', 'event', 'creator_login'])
    for _, event in issues_events.iterrows():
        if event['creator_login'] == developer_login:
            util.add(issues_events_data, event)

    if len(issues_events_data) > 0:
        # NOTE(review): pandas renamed `line_terminator` to `lineterminator`
        # in 1.5 and removed the old name in 2.0; kept unchanged here --
        # confirm against the pinned pandas version.
        issues_events_data.to_csv(path + '/issues_events.csv',
                                  sep=',',
                                  na_rep='NA',
                                  header=True,
                                  index=False,
                                  mode='w',
                                  encoding='utf-8',
                                  quoting=None,
                                  quotechar='"',
                                  line_terminator='\n',
                                  decimal='.')
    print('{}: Issues Events Extraction Complete'.format(developer_login))
def writeCoreDevelopers(super_path, project_name):
    """Select the developers with enough breaks and write their ids to CSV.

    Reads ``inactivity_interval_list.csv`` and ``break_dates_list.csv`` from
    ``<super_path>/<project_name>``, pairs them row by row, and keeps every
    row whose first ``durations`` field does not contain ``'[bot]'`` and whose
    break count (row length minus 3) is at least ``SLIDE_WIN_SIZE``. The
    first field of each kept row is written to ``active_users.csv``
    (``;``-separated, single ``id`` column) in the same directory.

    Args:
        super_path: Directory that contains the project folder.
        project_name: Name of the project folder under ``super_path``.

    Returns:
        pandas.DataFrame with a single ``id`` column holding the kept ids.
    """
    SLIDE_WIN_SIZE = 20  # minimum number of breaks a developer must have
    path = super_path + '/' + project_name

    with open(path + '/inactivity_interval_list.csv', 'r') as f:
        inactivity_intervals_data = [
            list(map(str, rec)) for rec in csv.reader(f, delimiter=',')
        ]
    # Read Break Dates Table
    with open(path + '/break_dates_list.csv', 'r') as f:
        break_dates_data = [
            list(map(str, rec)) for rec in csv.reader(f, delimiter=',')
        ]

    breaks_df = pandas.DataFrame({'durations': inactivity_intervals_data,
                                  'datelimits': break_dates_data})

    # FILTER DEVELOPERS
    active_users_df = pandas.DataFrame(columns=['durations', 'datelimits'])
    for _, row in breaks_df.iterrows():
        durations = row['durations']
        # presumably the first three fields are per-developer metadata and
        # the remainder are break entries -- TODO confirm against the writer
        # of inactivity_interval_list.csv
        num_breaks = len(durations) - 3
        # Length test first with short-circuit `and`: an empty CSV row is
        # rejected before durations[0] is indexed.
        if num_breaks >= SLIDE_WIN_SIZE and '[bot]' not in durations[0]:
            util.add(active_users_df, row)

    num_all_users = len(inactivity_intervals_data)
    num_active_users = len(active_users_df)
    # The original message ran the project name straight into 'All Users:';
    # a separating space is added.
    logging.info(
        'Project: %s All Users: %s Breaks_Threshold/Sliding_Window: %s'
        ' Active Users: %s',
        project_name, num_all_users, SLIDE_WIN_SIZE, num_active_users)

    active_users = [
        row['durations'][0] for _, row in active_users_df.iterrows()
    ]
    active_users_ids_df = pandas.DataFrame(active_users, columns=['id'])
    # NOTE(review): pandas renamed `line_terminator` to `lineterminator` in
    # 1.5 and removed the old name in 2.0; kept unchanged -- confirm the
    # pinned pandas version.
    active_users_ids_df.to_csv(path + '/active_users.csv',
                               sep=';',
                               encoding='utf-8',
                               na_rep='NA',
                               header=True,
                               index=False,
                               mode='w',
                               quoting=None,
                               quotechar='"',
                               line_terminator='\n',
                               decimal='.')
    print('Core Developer Written for ' + project_name)
    return active_users_ids_df
def writeCommitFile_Login(gith, project_url, start_date, end_date, path):
    """Download a repository's commits as (sha, author login, date) rows.

    Commits between ``start_date`` and ``end_date`` are read page by page and
    accumulated in ``<path>/commits_raw_login.csv``. The run is resumable:
    on a GitHub error or timeout the data gathered so far is saved, the
    current page is appended to ``<path>/commits_extraction.log`` and the
    extraction restarts from that page. A commit that raises AttributeError
    while being read is recorded in ``<path>/excluded_for_NoneType.csv`` and
    skipped on the next attempt.

    Args:
        gith: GitHub client; passed to ``cfg.waitRateLimit`` and used for
            ``get_repo``.
        project_url: Repository identifier given to ``gith.get_repo``.
        start_date: Lower bound passed as ``since`` to ``get_commits``.
        end_date: Upper bound passed as ``until`` to ``get_commits``.
        path: Existing directory that holds the CSV and log files.

    Returns:
        None. Returns early (after printing) when the commit count cannot be
        obtained.

    Raises:
        Any exception not handled as a retry is re-raised after progress has
        been saved.
    """
    import os
    import requests

    commits_csv = path + '/commits_raw_login.csv'
    excluded_csv = path + '/excluded_for_NoneType.csv'
    log_path = path + '/commits_extraction.log'

    def _to_csv(frame, target):
        # NOTE(review): pandas renamed `line_terminator` to `lineterminator`
        # in 1.5 and removed the old name in 2.0; kept unchanged -- confirm
        # the pinned pandas version.
        frame.to_csv(target, sep=',', na_rep='NA', header=True, index=False,
                     mode='w', encoding='utf-8', quoting=None, quotechar='"',
                     line_terminator='\n', decimal='.')

    def _save_commits(frame):
        # Nothing is written while the table is still empty.
        if len(frame) > 0:
            _to_csv(frame, commits_csv)

    # `with` guarantees the log handle is closed on every exit path,
    # including the early returns and the final re-raise.
    with open(log_path, 'a+') as logger:
        exception_thrown = True
        while exception_thrown:
            exception_thrown = False

            cfg.waitRateLimit(gith)
            repo = gith.get_repo(project_url)
            # Fake users to be filtered out
            # (author_id NOT IN (SELECT id from users where fake=1))
            commits = repo.get_commits(since=start_date, until=end_date)

            count_exception = True
            while count_exception:
                count_exception = False
                try:
                    num_items = commits.totalCount
                except github.GithubException as ghe:
                    if str(ghe) == '500 None':
                        print('Failed to get commits from this project (500 None: Ignoring Repo):', project_url)
                        return
                    elif str(ghe).startswith('409'):
                        print('Failed to get commits from this project (409 Empty: Ignoring Repo):', project_url)
                        return
                    else:
                        print('Failed to get commits from this project (GITHUB Unknown: Retrying):', project_url)
                        count_exception = True
                except requests.exceptions.Timeout:
                    print('Failed to get commits from this project (TIMEOUT: Retrying):', project_url)
                    count_exception = True
                except Exception:
                    # `Exception`, not a bare except: Ctrl-C / SystemExit
                    # must propagate instead of being reported as an empty
                    # repository.
                    print('Failed to get commits from this project (Probably Empty): ', project_url)
                    return

            last_page = int(num_items / cfg.items_per_page)
            last_page_read = 0

            existing_files = os.listdir(path)
            if 'commits_raw_login.csv' in existing_files:
                commits_data = pandas.read_csv(commits_csv, sep=',')
                last_page_read = util.get_last_page_read_short(log_path)
            else:
                commits_data = pandas.DataFrame(
                    columns=['sha', 'author_id', 'date'])

            if 'excluded_for_NoneType.csv' in existing_files:
                excluded_commits = pandas.read_csv(excluded_csv, sep=',')
            else:
                excluded_commits = pandas.DataFrame(columns=['sha'])

            # Bound before the loop so the handlers can always log a page.
            page = last_page_read
            try:
                # One set lookup per commit instead of rebuilding two lists
                # from the DataFrames for every commit.
                known_shas = set(commits_data.sha.tolist())
                known_shas.update(excluded_commits.sha.tolist())

                for page in range(last_page_read, last_page + 1):
                    for commit in commits.get_page(page):
                        cfg.waitRateLimit(gith)
                        sha = commit.sha
                        if sha in known_shas:
                            continue
                        ### If author is NoneType, that means the author is
                        ### no longer active in GitHub
                        if commit.author:
                            cfg.waitRateLimit(gith)
                            author_id = commit.author.login  ### HERE IS THE DIFFERENCE
                            date = commit.commit.author.date
                            util.add(commits_data, [sha, author_id, date])
                            known_shas.add(sha)
                _save_commits(commits_data)
            except github.GithubException:
                print('Exception Occurred While Getting COMMITS: Github')
                _save_commits(commits_data)
                logger.write('last_page:{}\n'.format(page))
                logger.flush()
                exception_thrown = True
            except requests.exceptions.Timeout:
                print('Exception Occurred While Getting COMMITS: Timeout')
                _save_commits(commits_data)
                logger.write('last_page:{}\n'.format(page))
                logger.flush()
                exception_thrown = True
            except AttributeError:
                print('Exception Occurred While Getting COMMIT DATA: NoneType for Author. SHA: ' + sha)
                # Remember the offending commit so the retry skips it.
                util.add(excluded_commits, [sha])
                _save_commits(commits_data)
                _to_csv(excluded_commits, excluded_csv)
                logger.write('last_page:{}\n'.format(page))
                logger.flush()
                exception_thrown = True
            except BaseException:
                # Deliberately catches everything (e.g. Ctrl-C): progress is
                # saved, then the exception is re-raised untouched.
                print('Execution Interrupted While Getting COMMITS')
                _save_commits(commits_data)
                logger.write('last_page:{}\n'.format(page))
                logger.flush()
                raise
def get_issue_events_repo(gith, path, repo, project_name, start_date, active_users):
    """Collect issue events performed by the given users across a repository.

    Walks every issue returned by ``repo.get_issues`` (all states, since
    ``start_date``) and every event of each issue, keeping events whose
    actor login is in ``active_users``. The extraction is resumable:

    * ``issues_events_repo.csv`` holds the events gathered so far,
    * ``events_extraction_completed_issues.csv`` holds fully processed issues,
    * ``issues_events_extraction.log`` holds the position at the last failure.

    On a GitHub error or timeout the state is saved and the whole walk is
    retried from the logged position. On success the events file is renamed
    to ``complete_issues_events_repo.csv``.

    Args:
        gith: GitHub client passed to ``cfg.waitRateLimit``.
        path: Output directory (created if missing).
        repo: Repository object providing ``get_issues``.
        project_name: Unused by this function.
        start_date: Passed as ``since`` to ``repo.get_issues``.
        active_users: Container of logins; only their events are kept.

    Returns:
        None.

    Raises:
        Any exception not handled as a retry is re-raised after the current
        position and data have been saved.
    """
    #Why Not get_events()?
    events_csv = path + '/issues_events_repo.csv'
    completed_csv = path + '/events_extraction_completed_issues.csv'
    log_path = path + '/issues_events_extraction.log'
    position_format = 'last_issues_page:{},last_issue:{},last_event_page:{}\n'

    def _to_csv(frame, target):
        # NOTE(review): pandas renamed `line_terminator` to `lineterminator`
        # in 1.5 and removed the old name in 2.0; kept unchanged -- confirm
        # the pinned pandas version.
        frame.to_csv(target, sep=',', na_rep='NA', header=True, index=False,
                     mode='w', encoding='utf-8', quoting=None, quotechar='"',
                     line_terminator='\n', decimal='.')

    def _checkpoint(events, completed):
        # The events file is only written once there is at least one row;
        # the completed-issues file is always written.
        if len(events) > 0:
            _to_csv(events, events_csv)
        _to_csv(completed, completed_csv)

    os.makedirs(path, exist_ok=True)

    exception_thrown = True
    while exception_thrown:
        exception_thrown = False

        existing_files = os.listdir(path)
        if 'issues_events_repo.csv' in existing_files:
            issues_events_data = pandas.read_csv(events_csv, sep=',')
        else:
            issues_events_data = pandas.DataFrame(
                columns=['id', 'date', 'event', 'creator_login'])

        if 'issues_events_extraction.log' in existing_files:
            last_issues_page, last_issue, last_events_page = get_last_page_read(log_path)
        else:
            last_issues_page = 0
            last_issue = ''
            last_events_page = 0

        if 'events_extraction_completed_issues.csv' in existing_files:
            completed_issues = pandas.read_csv(completed_csv, sep=',')
        else:
            completed_issues = pandas.DataFrame(columns=['id'])

        # Opened per attempt and closed by `with`, so retries no longer
        # accumulate open handles on the log file.
        with open(log_path, 'a+') as logger:
            # Position reported by the handlers if the attempt fails.
            issues_page = last_issues_page
            issue_id = ''
            page = 0
            ### Get Other Issues Events
            try:
                # Set lookups replace rebuilding a list from the DataFrame
                # for every issue and every event.
                done_issues = set(completed_issues.id.tolist())
                seen_events = set(issues_events_data.id.tolist())

                issues = repo.get_issues(state='all', sort='created_at',
                                         since=start_date)
                num_issues = issues.totalCount
                final_issues_page = int(num_issues / cfg.items_per_page)

                for issues_page in range(last_issues_page, final_issues_page + 1):
                    cfg.waitRateLimit(gith)
                    for issue in issues.get_page(issues_page):
                        cfg.waitRateLimit(gith)
                        issue_id = issue.id
                        if issue_id in done_issues:
                            continue

                        # Only the issue named in the log resumes mid-way;
                        # every other issue starts from its first page.
                        last_page = last_events_page if issue_id == last_issue else 0
                        # Reset the logged page for this issue. Without this a
                        # failure before the page loop would log the previous
                        # issue's last page and the resume would skip pages.
                        page = last_page

                        cfg.waitRateLimit(gith)
                        issue_events = issue.get_events()
                        num_items = issue_events.totalCount
                        final_page = int(num_items / cfg.items_per_page)

                        for page in range(last_page, final_page + 1):
                            cfg.waitRateLimit(gith)
                            for event in issue_events.get_page(page):
                                cfg.waitRateLimit(gith)
                                event_id = event.id
                                if event_id in seen_events or not event.actor:
                                    continue
                                cfg.waitRateLimit(gith)
                                actor_login = event.actor.login
                                if actor_login in active_users:
                                    cfg.waitRateLimit(gith)
                                    util.add(issues_events_data, [
                                        event_id, event.created_at,
                                        event.event, actor_login
                                    ])
                                    seen_events.add(event_id)

                        util.add(completed_issues, issue_id)
                        done_issues.add(issue_id)

                _checkpoint(issues_events_data, completed_issues)
                # The events file does not exist when no event was ever kept;
                # renaming it unconditionally raised FileNotFoundError.
                if os.path.isfile(events_csv):
                    os.rename(events_csv,
                              path + '/complete_issues_events_repo.csv')
                print('{}: Issues Events Extraction Complete'.format(repo))
            except github.GithubException as ghe:
                print('Exception Occurred While Getting ISSUES EVENTS: Github')
                logger.write(position_format.format(issues_page, issue_id, page))
                logger.flush()
                if str(ghe) == '500 None':
                    # A server error on this issue: mark it done so the retry
                    # does not hit it again.
                    print('PROBLEMS ON ISSUE: {} Excluded From Events Extraction'.format(issue_id))
                    util.add(completed_issues, issue_id)
                _checkpoint(issues_events_data, completed_issues)
                exception_thrown = True
            except requests.exceptions.Timeout:
                print('Exception Occurred While Getting ISSUES EVENTS: Timeout')
                logger.write(position_format.format(issues_page, issue_id, page))
                logger.flush()
                _checkpoint(issues_events_data, completed_issues)
                exception_thrown = True
            except BaseException:
                # Deliberately catches everything (e.g. Ctrl-C): state is
                # saved, then the exception is re-raised untouched.
                print('Execution Interrupted While Getting ISSUES EVENTS')
                logger.write(position_format.format(issues_page, issue_id, page))
                logger.flush()
                _checkpoint(issues_events_data, completed_issues)
                raise
def get_issues_prs(org_path, gith, repo, project_name, start_date, developer_login):
    """Download the issues / pull requests created by one developer.

    Issues returned by ``repo.get_issues`` (all states, since ``start_date``,
    ``creator=developer_login``) are read page by page and stored as
    ``(id, date, creator_login)`` rows in
    ``<org_path>/<project_name>/Activities_Plots/<developer_login>/issues_pr_creation.csv``.
    The current page is appended to ``issues_pr_extraction.log`` in the same
    directory; on a GitHub error or timeout the data is saved and the
    extraction restarts from the logged page.

    Args:
        org_path: Directory that contains the project folder.
        gith: GitHub client passed to ``cfg.waitRateLimit``.
        repo: Repository object providing ``get_issues``.
        project_name: Name of the project folder under ``org_path``.
        start_date: Passed as ``since`` to ``repo.get_issues``.
        developer_login: Login passed as ``creator`` to ``repo.get_issues``.

    Returns:
        None. Returns early (after printing) when the item count cannot be
        obtained.

    Raises:
        Any exception not handled as a retry is re-raised after progress has
        been saved.
    """
    path = org_path + '/' + project_name + '/Activities_Plots/' + developer_login
    data_csv = path + '/issues_pr_creation.csv'
    log_path = path + '/issues_pr_extraction.log'
    os.makedirs(path, exist_ok=True)

    def _save(frame):
        # Nothing is written while the table is still empty.
        # NOTE(review): pandas renamed `line_terminator` to `lineterminator`
        # in 1.5 and removed the old name in 2.0; kept unchanged -- confirm
        # the pinned pandas version.
        if len(frame) > 0:
            frame.to_csv(data_csv, sep=',', na_rep='NA', header=True,
                         index=False, mode='w', encoding='utf-8',
                         quoting=None, quotechar='"', line_terminator='\n',
                         decimal='.')

    exception_thrown = True
    while exception_thrown:
        exception_thrown = False

        # Opened per attempt and closed by `with` on every exit path,
        # including the early return below, so retries do not leak handles.
        with open(log_path, 'a+') as logger:
            ### Get Issue / Pull Requests
            created_issues_prs = repo.get_issues(state='all',
                                                 sort='created_at',
                                                 since=start_date,
                                                 creator=developer_login)

            count_exception = True
            while count_exception:
                count_exception = False
                try:
                    num_items = created_issues_prs.totalCount
                except github.GithubException:
                    # The message previously said TIMEOUT for this GitHub
                    # error, which was misleading.
                    print(
                        'Failed to get ISSUES/PRs Number from User {} and Project {} (GITHUB: Retrying)'
                        .format(developer_login, project_name))
                    count_exception = True
                except requests.exceptions.Timeout:
                    print(
                        'Failed to get ISSUES/PRs Number from User {} and Project {} (TIMEOUT: Retrying)'
                        .format(developer_login, project_name))
                    count_exception = True
                except Exception:
                    # `Exception`, not a bare except: Ctrl-C / SystemExit
                    # must propagate instead of being reported as "empty".
                    print(
                        'Failed to get ISSUES/PRs Number from User {} and Project {} (Probably Empty)'
                        .format(developer_login, project_name))
                    return

            last_page = int(num_items / cfg.items_per_page)
            last_page_read = 0

            if 'issues_pr_creation.csv' in os.listdir(path):
                issues_prs_data = pandas.read_csv(data_csv, sep=',')
                last_page_read = get_last_page_read_short(log_path)
            else:
                issues_prs_data = pandas.DataFrame(
                    columns=['id', 'date', 'creator_login'])

            # Bound before the loop so the log lines below never hit an
            # unbound name, even if the page range turns out to be empty.
            page = last_page_read
            try:
                # Set lookup instead of rebuilding a list per issue.
                known_ids = set(issues_prs_data.id.tolist())

                for page in range(last_page_read, last_page + 1):
                    for issue in created_issues_prs.get_page(page):
                        issue_id = issue.id
                        if issue_id in known_ids:
                            continue
                        if issue.user:
                            cfg.waitRateLimit(gith)
                            util.add(issues_prs_data, [
                                issue_id, issue.created_at, issue.user.login
                            ])
                            known_ids.add(issue_id)

                logger.write('last_page_read:{}\n'.format(page))
                logger.flush()
                _save(issues_prs_data)
                print('{}: Issues/Pulls Extraction Complete'.format(
                    developer_login))
            except github.GithubException:
                print('Exception Occurred While Getting ISSUES/PULLS: Github')
                logger.write('last_page_read:{}\n'.format(page))
                logger.flush()
                _save(issues_prs_data)
                exception_thrown = True
            except requests.exceptions.Timeout:
                print('Exception Occurred While Getting ISSUES/PULLS: Timeout')
                logger.write('last_page_read:{}\n'.format(page))
                logger.flush()
                _save(issues_prs_data)
                exception_thrown = True
            except BaseException:
                # Deliberately catches everything (e.g. Ctrl-C): progress is
                # saved, then the exception is re-raised untouched.
                print('Execution Interrupted While Getting ISSUES/PULLS')
                logger.write('last_page_read:{}\n'.format(page))
                logger.flush()
                _save(issues_prs_data)
                raise
def get_pulls_comments_repo(gith, path, repo, project_name, start_date, active_users):
    """Collect pull-request comments written by the given users.

    Walks every pull request returned by ``repo.get_pulls`` (all states) and
    every comment of each pull, keeping comments whose author login is in
    ``active_users``. The extraction is resumable:

    * ``pulls_comments_repo.csv`` holds the comments gathered so far,
    * ``comments_extraction_completed_pulls.csv`` holds fully processed pulls,
    * ``pulls_comments_extraction.log`` holds the position at the last failure.

    On a GitHub error or timeout the state is saved and the whole walk is
    retried from the logged position. On success the comments file is renamed
    to ``complete_pulls_comments_repo.csv``.

    Args:
        gith: GitHub client passed to ``cfg.waitRateLimit``.
        path: Output directory (created if missing).
        repo: Repository object providing ``get_pulls``.
        project_name: Unused by this function.
        start_date: Unused by this function.
        active_users: Container of logins; only their comments are kept.

    Returns:
        None.

    Raises:
        Any exception not handled as a retry is re-raised after the current
        position and data have been saved.
    """
    comments_csv = path + '/pulls_comments_repo.csv'
    completed_csv = path + '/comments_extraction_completed_pulls.csv'
    log_path = path + '/pulls_comments_extraction.log'
    position_format = 'last_pulls_page:{},last_pull:{},last_comment_page:{}\n'

    def _to_csv(frame, target):
        # NOTE(review): pandas renamed `line_terminator` to `lineterminator`
        # in 1.5 and removed the old name in 2.0; kept unchanged -- confirm
        # the pinned pandas version.
        frame.to_csv(target, sep=',', na_rep='NA', header=True, index=False,
                     mode='w', encoding='utf-8', quoting=None, quotechar='"',
                     line_terminator='\n', decimal='.')

    def _checkpoint(comments, completed):
        # The comments file is only written once there is at least one row;
        # the completed-pulls file is always written.
        if len(comments) > 0:
            _to_csv(comments, comments_csv)
        _to_csv(completed, completed_csv)

    os.makedirs(path, exist_ok=True)

    exception_thrown = True
    while exception_thrown:
        exception_thrown = False

        existing_files = os.listdir(path)
        if 'pulls_comments_repo.csv' in existing_files:
            pulls_comments_data = pandas.read_csv(comments_csv, sep=',')
        else:
            pulls_comments_data = pandas.DataFrame(
                columns=['id', 'date', 'creator_login'])

        if 'pulls_comments_extraction.log' in existing_files:
            last_pulls_page, last_pull, last_comments_page = get_last_page_read(log_path)
        else:
            last_pulls_page = 0
            last_pull = ''
            last_comments_page = 0

        if 'comments_extraction_completed_pulls.csv' in existing_files:
            completed_pulls = pandas.read_csv(completed_csv, sep=',')
        else:
            completed_pulls = pandas.DataFrame(columns=['id'])

        # Opened per attempt and closed by `with`, so retries no longer
        # accumulate open handles on the log file.
        with open(log_path, 'a+') as logger:
            # Position reported by the handlers if the attempt fails.
            pulls_page = last_pulls_page
            pull_id = ''
            page = 0
            ### Get Comments on Pull
            try:
                # Set lookups replace rebuilding a list from the DataFrame
                # for every pull and every comment.
                done_pulls = set(completed_pulls.id.tolist())
                seen_comments = set(pulls_comments_data.id.tolist())

                pulls = repo.get_pulls(state='all', sort='created_at')
                num_pulls = pulls.totalCount
                final_pulls_page = int(num_pulls / cfg.items_per_page)

                for pulls_page in range(last_pulls_page, final_pulls_page + 1):
                    cfg.waitRateLimit(gith)
                    for pull in pulls.get_page(pulls_page):
                        cfg.waitRateLimit(gith)
                        pull_id = pull.id
                        if pull_id in done_pulls:
                            continue

                        # Only the pull named in the log resumes mid-way;
                        # every other pull starts from its first page.
                        last_page = last_comments_page if pull_id == last_pull else 0
                        # Reset the logged page for this pull. Without this a
                        # failure before the page loop would log the previous
                        # pull's last page and the resume would skip pages.
                        page = last_page

                        cfg.waitRateLimit(gith)
                        pulls_comments = pull.get_comments()
                        num_items = pulls_comments.totalCount
                        final_page = int(num_items / cfg.items_per_page)

                        for page in range(last_page, final_page + 1):
                            cfg.waitRateLimit(gith)
                            for comment in pulls_comments.get_page(page):
                                cfg.waitRateLimit(gith)
                                comment_id = comment.id
                                if comment_id in seen_comments or not comment.user:
                                    continue
                                cfg.waitRateLimit(gith)
                                user_login = comment.user.login
                                if user_login in active_users:
                                    cfg.waitRateLimit(gith)
                                    util.add(pulls_comments_data, [
                                        comment_id, comment.created_at,
                                        user_login
                                    ])
                                    seen_comments.add(comment_id)

                        util.add(completed_pulls, pull_id)
                        done_pulls.add(pull_id)

                _checkpoint(pulls_comments_data, completed_pulls)
                # The comments file does not exist when no comment was ever
                # kept; renaming it unconditionally raised FileNotFoundError.
                if os.path.isfile(comments_csv):
                    os.rename(comments_csv,
                              path + '/complete_pulls_comments_repo.csv')
                print('{}: Pulls Comments Extraction Complete'.format(repo))
            except github.GithubException as ghe:
                print('Exception Occurred While Getting PULLS COMMENTS: Github')
                logger.write(position_format.format(pulls_page, pull_id, page))
                logger.flush()
                if str(ghe) == '500 None':
                    # A server error on this pull: mark it done so the retry
                    # does not hit it again.
                    print('PROBLEMS ON PULL: {} Excluded From Comments Extraction'.format(pull_id))
                    util.add(completed_pulls, pull_id)
                _checkpoint(pulls_comments_data, completed_pulls)
                exception_thrown = True
            except requests.exceptions.Timeout:
                print('Exception Occurred While Getting PULLS COMMENTS: Timeout')
                logger.write(position_format.format(pulls_page, pull_id, page))
                logger.flush()
                _checkpoint(pulls_comments_data, completed_pulls)
                exception_thrown = True
            except BaseException:
                # Deliberately catches everything (e.g. Ctrl-C): state is
                # saved, then the exception is re-raised untouched.
                print('Execution Interrupted While Getting PULLS COMMENTS')
                logger.write(position_format.format(pulls_page, pull_id, page))
                logger.flush()
                _checkpoint(pulls_comments_data, completed_pulls)
                raise
# Builds one row of the break-count filter statistics for the chosen project
# and rewrites the statistics CSV.
# NOTE(review): p_names, chosen_project, chosen_organization, main_path and
# table are defined outside this fragment; the enclosing structure (possibly a
# loop over projects) is not visible here -- confirm before moving this code.
main_project_name = p_names[chosen_project]
path = main_path + '/' + chosen_organization + '/' + main_project_name
with open(path + '/inactivity_interval_list.csv', 'r') as f:  #opens PW file
    # Every CSV record becomes a list of strings.
    inactivity_intervals_data = [
        list(map(str, rec)) for rec in csv.reader(f, delimiter=',')
    ]
# One record per developer is assumed, so the record count is reported as the
# number of developers -- TODO confirm against the file's writer.
all_devs = len(inactivity_intervals_data)
row = [main_project_name, all_devs]
# One column per threshold 10, 20, ..., 80.
for threshold in range(10, 90, 10):
    num = getNumCoreDevelopers(inactivity_intervals_data, threshold)
    row.append(num)
util.add(table, row)
# The whole table is rewritten (mode='w') using ';' as separator.
table.to_csv(main_path + '/breaks_number_filter_stats.csv',
             sep=';',
             na_rep='NA',
             header=True,
             index=False,
             mode='w',
             encoding='utf-8',
             quoting=None,
             quotechar='"',
             line_terminator='\n',
             decimal='.')
#Read Break Dates Table with open(super_path+'/break_dates_list.csv', 'r') as f: break_dates_data = [list(map(str,rec)) for rec in csv.reader(f, delimiter=',')] breaks_df = pandas.DataFrame({'durations' : inactivity_intervals_data, 'datelimits' : break_dates_data}) # FILTER DEVELOPERS active_users_breaks = pandas.DataFrame(columns=['durations','datelimits']) path = (super_path+'/'+project_name) for index, row in breaks_df.iterrows(): num_breaks=len(row['durations'])-3 if (row['durations'][0] in active_users): util.add(active_users_breaks, row) num_all_users = len(inactivity_intervals_data) num_active_users = len(active_users_breaks) active_users_longer_intervals=[] active_devs_sleeping_intervals_df = [] active_devs_hibernation_intervals_df = [] active_devs_dead_intervals_df = [] n=0 for index, row in active_users_breaks.iterrows(): user_id=row['durations'][0] last_commit_day=util.getLastCommitDay(commit_table, user_id) last_break_length=util.days_between(last_commit_day, project_end)