def get_dataframe_from_repo(repo, num=100):
    """Tally a repository's contributors by country as a pandas dataframe.

    Args:
        repo - a full GitHub repo URL
        num - number of contributors to analyze per repo

    Returns:
        df - a pandas dataframe with "country" and "contributor_count"
            columns, one row per country, most common country first
        num_contributors - length of the contributor collection returned
            by get_contributors()
    """
    # resolve the URL to the string get_contributors() expects
    owner_and_repo = extract_github_owner_and_repo(repo)
    contributors = get_contributors(owner_and_repo, num)
    num_contributors = len(contributors)

    # look up each contributor's location, map it to a country, and count
    country_counter = Counter(
        get_country_from_location(get_contributor_location(person))
        for person in contributors
    )

    # most_common() yields (country, count) pairs in descending count order
    rows = country_counter.most_common()
    df = pd.DataFrame.from_records(
        rows, columns=["country", "contributor_count"]
    )
    return df, num_contributors
def scan_single_repo(repo, summary, output_csv, num=100):
    """Print contributor location results for one GitHub repository.

    A banner naming the repo is printed first. Results are then printed
    either aggregated by country or listed per contributor.

    Args:
        repo - URL of repo
        summary - if truthy, print results by country; otherwise print
            them by contributor
        output_csv - forwarded to print_by_contributor(); only used when
            summary is falsy
        num - max number of contributors to analyze

    Returns:
        None
    """
    owner_and_repo = extract_github_owner_and_repo(repo)
    contributors = get_contributors(owner_and_repo, num)

    # banner identifying which repo the following output belongs to
    divider = "-----------------"
    print(divider)
    print("GITHUB REPO: {}".format(owner_and_repo))
    print(divider)

    if not summary:
        print_by_contributor(owner_and_repo, contributors, output_csv)
        return
    print_by_country(contributors)
def test_print_by_country(capsys):
    """Unit test for print_by_country() using the networkml python package.

    Fetches the contributors of the iqtlabs/networkml repo, prints them by
    country, and compares the captured stdout against a hard-coded table.

    Args:
        capsys - pytest fixture used to capture what was printed to stdout

    NOTE(review): the expected counts are hard-coded; presumably
    get_contributors() reads live GitHub data, so they may drift - confirm.
    """
    repo = "https://www.github.com/iqtlabs/networkml"
    repo_ending_string = extract_github_owner_and_repo(repo)
    contributors = get_contributors(repo_ending_string)
    print_by_country(contributors)
    captured = capsys.readouterr()  # capture output printed so far
    # dedent strips the leading whitespace that is common to every line
    # NOTE(review): the literal below is reproduced exactly as it appears in
    # this view, where its line breaks look collapsed - verify its layout
    # against the real output of print_by_country() before relying on it.
    output_text = textwrap.dedent(""" COUNTRY | # OF CONTRIBUTORS --------------------------- None 11 United States 4 New Zealand 2\n""")
    assert captured.out == output_text
def scan_single_repo(repo, summary):
    """Print contributor location results for one GitHub repository.

    A banner naming the repo is printed first, followed by the results
    either aggregated by country or listed per contributor.

    Args:
        repo - URL of repo on GitHub
        summary - if truthy, print results by country; otherwise print
            them by contributor

    Returns:
        None
    """
    owner_and_repo = extract_github_owner_and_repo(repo)
    contributors = get_contributors(owner_and_repo)

    # banner: divider, repo name, divider
    divider = "-----------------"
    for banner_line in (
        divider,
        "GITHUB REPO: {}".format(owner_and_repo),
        divider,
    ):
        print(banner_line)

    # pick the printer that matches the requested level of detail
    printer = print_by_country if summary else print_by_contributor
    printer(contributors)
def scan_multiple_repos(input_file="repos.txt", num=100):
    """Create csv of data for multiple repos.

    Scan through the repos listed in input_file and create a single csv
    that stores all contributor-related data for each contributor in each
    repo.

    Args:
        input_file - file containing repo list, one repo per line; blank
            or whitespace-only lines are skipped
        num - max number of contributors to analyze per repo

    Returns:
        None
    """
    # create csv to store multi-repo scan results
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    create_csv("multirepo", timestamp)

    # open file that contains repos to scan and append contributors for each
    # repo to csv.
    with open(input_file, "r") as input_repos:
        for line in input_repos:
            # Lines read from a file keep their trailing newline, so a blank
            # line is "\n", never "". Strip first, then test for emptiness,
            # otherwise blank lines are passed on as repo names.
            repo = line.strip()
            if not repo:
                continue

            repo_ending_string = extract_github_owner_and_repo(repo)
            contributors = get_contributors(repo_ending_string, num)
            for contributor in contributors:
                location = get_contributor_location(contributor)
                country = get_country_from_location(location)
                add_committer_to_csv(
                    "multirepo",
                    repo_ending_string,
                    timestamp,
                    contributor,
                    location,
                    country,
                )
def test_extract_github_owner_and_repo(self):
    """Check extract_github_owner_and_repo() on a scheme-less GitHub URL."""
    url = "www.github.com/psf/requests"
    expected = "psf/requests"
    assert extract_github_owner_and_repo(url) == expected