from datamanager.filemanager import FileManager
from properties import dataFolderPath
import pandas as pd
import numpy as np
import os
import re

# dataFolderPath = '/Users/georgia/Desktop/Thesis/DataUser/datasets'

fm = FileManager()


def remove_outliers(data, percentile):
    """Drop every value above the given percentile of ``data``.

    :param data: iterable of numeric values (NaNs are tolerated).
    :param percentile: percentile (0-100) used as the upper cut-off.
    :returns: list containing only the items <= the percentile limit.
    """
    limit = np.nanpercentile(data, percentile)
    # NaN items fail the <= comparison, so they are dropped as well.
    return [item for item in data if item <= limit]


def normalise_data(data):
    """Min-max normalise ``data`` onto the 0-100 range.

    :param data: iterable of numeric values.
    :returns: list of normalised floats in [0, 100].

    NOTE(review): when all values are equal the denominator is zero and
    the results are NaN/inf — unchanged from the original behaviour.
    """
    series = pd.Series(data)
    # Hoist the min/max out of the loop: the original rebuilt a DataFrame
    # and recomputed its min and max for every single item (O(n^2) total).
    lo = series.min()
    hi = series.max()
    return [((item - lo) / (hi - lo)) * 100 for item in data]
import os
import sys
import traceback
import requests
import json
from logger.downloadlogger import Logger
from datamanager.dbmanager import DBManager
from datamanager.filemanager import FileManager
from downloader.gitdownloader import GitDownloader
from downloader.githubdownloader import GithubDownloader
#from list_of_repos_urls import List_of_repos_urls
from properties import GitHubAuthToken, dataFolderPath, gitExecutablePath, verbose

# Shared collaborators for the download pipeline: file/DB managers, a
# verbosity-controlled logger, and the two downloaders (GitHub API + git CLI).
fm = FileManager()
db = DBManager()
lg = Logger(verbose)
ghd = GithubDownloader(GitHubAuthToken)
gd = GitDownloader(gitExecutablePath, lg)

# NOTE(review): the author note below appears truncated ("This e...").
''' Haven't implemented checking if comments exist already nor step_action. This e '''

#user_api_address = "https://api.github.com/users/" + '/'.join(user_address.split('/')[-1:])
#user_name = '_'.join(user_address.split('/')[-1:])

# Hard-coded GitHub login whose previously downloaded data is loaded.
user_name = "nbriz"

# Prepare the on-disk store for this user, then read the stored project data.
db.initialize_write_to_disk(user_name)
project = db.read_project_from_disk(user_name)
- project popularity stats - watchers (mean) - project scale stats - commits (mean) - project scale stats - contributors (mean) - project scale stats - releases (mean) - issue_assigned_to_user_and_closed_by_user_time_diff - check seconds (mean) - issue_created_by_user_closed_by_user_time_diff - check seconds (mean) - time_diff_between_consequtive_commiits_committed_by_user - check seconds (mean) - pull merge time diff '''

# dataFolderPath = '/Users/georgia/Desktop/Thesis/DataUser/datasets'

fm = FileManager()
# Load every per-user stats JSON found under <dataFolderPath>/datasets.
stats = fm.read_stats_jsons_from_folder(
    os.path.join(dataFolderPath, "datasets"))


def remove_outliers(data, percentile):
    """Drop every value above the given percentile of ``data``.

    :param data: iterable of numeric values (NaNs are tolerated).
    :param percentile: percentile (0-100) used as the upper cut-off.
    :returns: list containing only the items <= the percentile limit.
    """
    new_data = []
    limit = np.nanpercentile(data, percentile)
    for item in data:
        # NaN items fail the comparison and are excluded as well.
        if item <= limit:
            new_data.append(item)
    return new_data
import json
from properties import GitHubAuthToken, dataFolderPath, gitExecutablePath, verbose, packageFolderPath
from downloader.githubdownloader import GithubDownloader
from datamanager.filemanager import FileManager

''' Here I download the profile links of the users that have more than ten public repos and more than 20 followers. These users will be used to do the benchmarking '''

ghd = GithubDownloader(GitHubAuthToken)
fm = FileManager()

# GitHub search endpoint: users with >= 10 public repos and >= 20 followers.
users = "https://api.github.com/search/users?q=repos:10+followers:20"

logins = []
final_list = []
# Collect the profile URL of every user returned by the paginated search.
for user in ghd.download_paginated_object(users):
    logins.append(user["html_url"])

for item in logins:
    # Profile URL -> login -> stats file previously written to disk.
    name = '_'.join(item.split('/')[-1:])
    try:
        stats = fm.read_json_from_file(dataFolderPath + "/" + name + "/user_stats.json")
        # Keep only users with a substantial authored-commit history.
        if stats["commit_authored"] > 100:
            final_list.append(item)
    except Exception:
        # Best-effort: skip users whose stats file is missing or malformed.
        # Fix: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to Exception.
        continue
from datasetcreator.project_preferences import Project_preferences #from datasetcreator.response_time import response_time_to_comments_mentioned from datasetcreator.testing import add_test_case, test_comments, closed_issues from datasetcreator.commits import files_in_commits, commit_changes, empty_commit_message, bug_fixing_contribution from datasetcreator.operational import documentation_comments, documentation_commit #user_address = "https://github.com/nbriz" #user_api_address = "https://api.github.com/users/" + '/'.join(user_address.split('/')[-1:]) user_name = 'nbriz' dataFolderPath = '/Users/georgia/Desktop' ''' This is a file to run to gather a first example of all possible raw data that can be dowloaded ''' productivity = Productivity(dataFolderPath, user_name) fm = FileManager() cm = Communication() lan = Languages() pm = Project_management(dataFolderPath, user_name) pp = Project_preferences() lg = Logger(verbose) gd = GitDownloader(gitExecutablePath, lg) lg.start_action("Retrieving user data ...", 29) contribution_days = productivity.contribution_days(dataFolderPath, user_name) fm.write_json_to_file( dataFolderPath + "/" + user_name + "/all_data/contribution_days.json", contribution_days) print("contribution_days done") lg.step_action()
from datamanager.filemanager import FileManager
from list_of_repos_urls import List_of_repos_urls
from helpers import get_number_of, print_usage, read_file_in_lines, get_total_count
from properties import GitHubAuthToken, dataFolderPath, gitExecutablePath, verbose, \
    download_commits_authored, download_commits_committed, download_issues_assigned, \
    download_issues_authored, download_issues_mentions, download_issues_commented, \
    download_issues_owened, download_repositories_owned, download_user_repos, download_issues_owened_full, \
    download_issues_commented_full, download_issues_mentions_full, download_issues_authored_full, \
    download_issues_assigned_full, download_commits_committed_full, download_commits_authored_full, download_issue_comments, \
    download_commit_comments

# NOTE(review): DBManager, Logger, GithubDownloader and GitDownloader are not
# imported in this chunk — presumably imported earlier in the file; confirm.
db = DBManager()
lg = Logger(verbose)
ghd = GithubDownloader(GitHubAuthToken)
gd = GitDownloader(gitExecutablePath, lg)
fm = FileManager()


def download_information(user_address):
    '''
    Downloads all the data of a user given its GitHub URL.

    :param user_address: the URL of the user of which the data are downloaded.
    '''
    # "https://github.com/<login>" -> API address, and a filesystem-safe name.
    user_api_address = "https://api.github.com/users/" + '/'.join(
        user_address.split('/')[-1:])
    user_name = '_'.join(user_address.split('/')[-1:])
    # Prepare the on-disk store for this user.
    # NOTE(review): the function body appears to continue beyond this chunk.
    db.initialize_write_to_disk(user_name)
from datamanager.filemanager import FileManager
from properties import dataFolderPath
import pandas as pd
import numpy as np
import json
import os

# dataFolderPath = '/Users/georgia/Desktop/Thesis/DataUser/datasets'

fm = FileManager()
# Load every per-user stats JSON found under <dataFolderPath>/datasets.
stats = fm.read_stats_jsons_from_folder(
    os.path.join(dataFolderPath, "datasets"))

# Single-letter code -> profile classification label.
profile_choices = {
    "A": "Ascending",
    "D": "Descending",
    "M": "Middle",
    "U": "Unclassified",
    "E": "Not Included"
}

# Single-letter code -> metric category label.
category_choices = {
    "A": "Main_Page",
    "B": "Productivity",
    "C": "Dev_Productivity",
    "D": "Ops_Productivity",
    "E": "Project_Management",
    "F": "Quality&Testing"
}

# NOTE(review): the string below is not closed in this chunk — it continues
# past the end of the visible text.
''' Ascending profile: The higher the better Descending profile: The lower the better Middle profile: Either extreme is bad
from properties import dataFolderPath
from datamanager.filemanager import FileManager
import json
import os
from analysis import to_day_hour_min_sec
import time
import calendar
import operator
from collections import OrderedDict

''' Create the view files for the UI '''

fm = FileManager()
# Load the per-user final datasets and the model JSON from disk.
datasets = fm.read_stats_jsons_from_folder(
    os.path.join(dataFolderPath, "datasets", "final_datasets"))
model = fm.read_json_from_file(
    os.path.join(dataFolderPath, "datasets", "model", "model.json"))
# datasets = fm.read_stats_jsons_from_folder(os.path.join(dataFolderPath,"datasets", "final_datasets"))

# Build one "view" dict per user for the UI.
# NOTE(review): the loop body continues beyond this chunk.
for user in datasets:
    view = {}
    #initialising
    view["scores"] = []
    view["stats"] = {}
    view["timeactive"] = {}
    view["projects"] = {}
    view["projects"]["labels"] = ["Owner", "Contributor"]
    view["languages"] = []