Exemplo n.º 1
0
def get_dataframe_from_repo(repo, num=100):
    """Build a per-country contributor tally for a GitHub repository.

    Args:
        repo - a full GitHub repo URL
        num - number of contributors to analyze per repo

    Returns:
        df - a pandas dataframe with "country" and "contributor_count"
            columns, ordered from most to least common country
        num_contributors - number of contributors that were analyzed
    """
    # resolve the repo and fetch its contributors
    owner_and_repo = extract_github_owner_and_repo(repo)
    contributors = get_contributors(owner_and_repo, num)
    num_contributors = len(contributors)

    # tally the country resolved from each contributor's location
    tally = Counter(
        get_country_from_location(get_contributor_location(person))
        for person in contributors)

    # most_common() yields (country, count) pairs in descending count order
    df = pd.DataFrame.from_records(
        tally.most_common(), columns=["country", "contributor_count"])

    return df, num_contributors
Exemplo n.º 2
0
def print_by_contributor(contributors, pypi_data=None):
    """
    Print each contributor together with their location and country

    Args:
        contributors - a list of contributors
        pypi_data - optional PyPI data object indexed by "pypi_maintainers";
            when given, contributors found in that entry are marked with "*"

    Returns:
        None
    """
    # a non-None pypi_data indicates a PyPI package scan
    pypi_scan = pypi_data is not None

    print("CONTRIBUTOR, LOCATION")
    if pypi_scan:
        print("* indicates PyPI maintainer")
    print("---------------------")

    for person in contributors:
        place = get_contributor_location(person)
        nation = get_country_from_location(place)
        try:
            if pypi_scan and person in pypi_data["pypi_maintainers"]:
                print(person, "*", "|", place, "|", nation)
                continue
            print(person, "|", place, "|", nation)
        except UnicodeEncodeError:
            # fall back to a placeholder row when the line cannot be encoded
            print(person, "| error")
Exemplo n.º 3
0
 def test_get_country_from_location_standard_order_with_comma(self):
     """Check comma-separated "place, region" strings resolve to a country."""
     # (location string, expected country) pairs, checked in order
     expectations = (
         ("Wellington, New Zealand", "New Zealand"),
         ("Jordan, Minnesota", "United States"),
         ("Jordan, MN", "United States"),
         ("Atlanta, Georgia", "United States"),
         ("Atlanta, Ga", "United States"),
         ("London, England", "United Kingdom"),
         ("Prague, Czech Republic", "Czech Republic"),
         ("Virginia, USA", "United States"),
     )
     for location, country in expectations:
         assert get_country_from_location(location) == country
Exemplo n.º 4
0
 def test_get_country_from_location_world_cities(self):
     """Check that well-known city names resolve to their country."""
     # (location string, expected country) pairs, checked in order
     expectations = (
         ("Tokyo", "Japan"),
         ("London", "United Kingdom"),
         ("Jakarta", "Indonesia"),
         ("Beijing", "China"),
         ("Washington D.C.", "United States"),
         ("Toronto, ON", "Canada"),
     )
     for location, country in expectations:
         assert get_country_from_location(location) == country
Exemplo n.º 5
0
def print_by_contributor(software_name,
                         contributors,
                         output_csv=False,
                         pypi_data=None):
    """Report the location and country of every contributor.

    Each contributor is printed to the terminal window. When output_csv is
    true, the same rows are also appended to a csv file.

    Args:
        software_name - name of package or repo
        contributors - a list of contributors
        output_csv - whether to output a csv.
        pypi_data - a pypi data object; contributors listed under its
            "pypi_maintainers" entry are marked with "*".

    Returns:
        None
    """
    # a non-None pypi_data indicates a PyPI package scan
    pypi_scan = pypi_data is not None

    if output_csv:
        # the current time gives each run its own csv filename
        timestamp = time.strftime("%Y%m%d-%H%M%S")
        create_csv("contributor", timestamp)

    print("CONTRIBUTOR, LOCATION")
    if pypi_scan:
        print("* indicates PyPI maintainer")
    print("---------------------")

    for person in contributors:
        place = get_contributor_location(person)
        nation = get_country_from_location(place)
        if output_csv:
            add_committer_to_csv("contributor", software_name, timestamp,
                                 person, place, nation)
        try:
            if pypi_scan and person in pypi_data["pypi_maintainers"]:
                print(person, "*", "|", place, "|", nation)
                continue
            print(person, "|", place, "|", nation)
        except UnicodeEncodeError:
            # fall back to a placeholder row when the line cannot be encoded
            print(person, "| error")
Exemplo n.º 6
0
def print_by_country(contributors):
    """
    Print the number of contributors per country, most common first

    Args:
        contributors: a list of contributors

    Returns:
        None
    """
    print("COUNTRY | # OF CONTRIBUTORS")
    print("---------------------------")

    # tally the country resolved from each contributor's location
    tally = Counter(
        get_country_from_location(get_contributor_location(person))
        for person in contributors)

    for nation, total in tally.most_common():
        print(nation, total)
def scan_multiple_repos(input_file="repos.txt", num=100):
    """Create csv of data for multiple repos.

    Scan through repos provided in the input file and create a single csv
    that stores all contributor-related data for each contributor in each
    repo. Blank and whitespace-only lines in the input file are skipped.

    Args:
        input_file - file containing repo list, one repo per line
        num - max number of contributors to analyze per repo

    Returns:
        None
    """
    # create csv to store multi-repo scan results
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    create_csv("multirepo", timestamp)

    # open file that contains repos to scan and append contributors for each
    # repo to csv.
    with open(input_file, "r") as input_repos:
        for line in input_repos:
            # Lines read from a file keep their trailing newline, so a blank
            # line never compares equal to ""; strip before testing.
            repo = line.strip()
            if not repo:
                continue
            repo_ending_string = extract_github_owner_and_repo(repo)
            contributors = get_contributors(repo_ending_string, num)
            for contributor in contributors:
                location = get_contributor_location(contributor)
                country = get_country_from_location(location)
                add_committer_to_csv(
                    "multirepo",
                    repo_ending_string,
                    timestamp,
                    contributor,
                    location,
                    country,
                )
Exemplo n.º 8
0
 def test_get_country_from_location_dataset_pull_geographies(self):
     """Cases of get_country_from_location() that fail as of 2/14/2021."""
     # (location string, expected country) pairs, checked in order
     expectations = (
         ("Saclay", "France"),
         ("Warszawa", "Poland"),
         ("brookline, ma", "United States"),
         ("Greater Los Angeles Area", "United States"),
         ("Forschungszentrum", "Germany"),
         ("Montigny-lès-Metz", "France"),
         ("roudnice nad labem, czech republic", "Czech Republic"),
         ("Berlin/Florence", "Germany"),
         ("Greater Seattle Area", "United States"),
         ("Flanders, Europe, Earth", "Belgium"),
         ("Wrocław", "Poland"),
     )
     for location, country in expectations:
         assert get_country_from_location(location) == country
Exemplo n.º 9
0
 def test_get_country_from_location_corner_case_geographies(self):
     """Check get_country_from_location on unusual geographies.

     The cases are (location string, expected country) pairs and are checked
     in order. Locations with no country are expected to give the string
     "None". Some locations appear more than once.
     """
     expectations = (
         ("Palestine", "Palestine"),
         ("San Francisco Bay Area", "United States"),
         ("EU", "None"),
         ("Canary Islands", "Spain"),
         ("Earth", "None"),
         ("Sydney", "Australia"),
         ("Amsterdam", "Netherlands"),
         ("NYC", "United States"),
         ("Barcelona", "Spain"),
         ("Kerala", "India"),
         ("Hyderabad", "India"),
         ("Vancouver", "Canada"),
         ("Jiangxi", "China"),
         ("San Francisco", "United States"),
         ("New York", "United States"),
         ("Saint Petersburg", "Russia"),
         ("England", "United Kingdom"),
         ("Athens", "Greece"),
         ("Europe", "None"),
         ("Lima", "Peru"),
         ("Bay Area", "United States"),
         ("EU", "None"),
         ("Canary Islands", "Spain"),
         ("waterloo", "United Kingdom"),
         ("Europe/Berlin", "None"),
         ("York", "United Kingdom"),
         ("München", "Germany"),
         ("Montreal, CA", "Canada"),
         ("Florianópolis", "Brazil"),
         ("Montréal", "Canada"),
         ("Bangalore", "India"),
         ("Dublin", "Ireland"),
         ("Santiago de Querétaro, México", "Mexico"),
         ("Jülich", "Germany"),
         ("Victoria, BC", "Canada"),
         ("Waterloo, ON", "Canada"),
         ("Falls Church, Virginia", "United States"),
         ("Amsterdam, the Netherlands", "Netherlands"),
         ("BeiJing", "China"),
         ("Edinburgh, Scotland", "United Kingdom"),
         ("Medellín, Colombia", "Colombia"),
         ("La Jolla, CA.", "United States"),
         ("beijing", "China"),
         ("Pemberton, British Columbia", "Canada"),
         # NOTE(review): this input looks like mis-encoded text; confirm
         # that the garbled form is the intended test case.
         ("Timi»ôoara", "Romania"),
         ("PRC", "China"),
         ("Amsterdam, The Netherlands", "Netherlands"),
         ("Oxford", "United Kingdom"),
         # NOTE(review): this input also looks mis-encoded; confirm.
         ("S√£o Paulo", "Brazil"),
         ("Kyiv", "Ukraine"),
         ("Vancouver, BC", "Canada"),
         ("N.H.", "United States"),
         ("Sri-City, Andhra Pradesh", "India"),
         ("Scotland", "United Kingdom"),
         ("Geneva", "Switzerland"),
         ("Rotterdam, the Netherlands", "Netherlands"),
         ("Milan", "Italy"),
         ("Republic of Korea", "South Korea"),
         ("Brasília, Brazil.", "Brazil"),
         ("beijing", "China"),
         ("Zürich", "Switzerland"),
         ("Kitchener, Ontario", "Canada"),
         ("Montréal, QC", "Canada"),
         ("Glasgow, Scotland", "United Kingdom"),
         ("28 rue du Dr Roux 75015 Paris, FRANCE", "France"),
         ("Kraków", "Poland"),
         ("İstanbul", "Turkey"),
         ("Russian Federation", "Russia"),
         ("Newcastle, NSW", "Australia"),
         ("Australia, Victoria", "Australia"),
         # the trailing space in the next input is part of the case
         ("Perth, Western Australia ", "Australia"),
         ("Gdańsk", "Poland"),
         ("SF", "United States"),
         ("Hyderabad (India)", "India"),
         ("BITS Pilani, Rajasthan", "India"),
         ("Sri-City, Andhra Pradesh", "India"),
     )
     for location, country in expectations:
         assert get_country_from_location(location) == country
Exemplo n.º 10
0
 def test_get_country_from_location_country_abbreviations(self):
     """Check that country abbreviations resolve to the full country name."""
     # location string -> expected country, checked in insertion order
     expectations = {
         "USA": "United States",
         "Cambridge, UK": "United Kingdom",
         "UK": "United Kingdom",
     }
     for location, country in expectations.items():
         assert get_country_from_location(location) == country
Exemplo n.º 11
0
 def test_get_country_from_location_standard_order_no_comma(self):
     """Check a standard order pair written without a comma."""
     resolved = get_country_from_location("Menlo Park CA")
     assert resolved == "United States"
Exemplo n.º 12
0
 def test_get_country_from_location_nonstandard_order(self):
     """Check pairs that name the country before the city."""
     for location in ("Russia, Moscow", "Russia, Nizhny Novgorod"):
         assert get_country_from_location(location) == "Russia"
Exemplo n.º 13
0
 def test_get_country_from_location_standard_order_with_comma(self):
     """Check comma-separated "place, region" strings resolve to a country."""
     # (location string, expected country) pairs, checked in order
     expectations = (
         ("Wellington, New Zealand", "New Zealand"),
         ("Jordan, Minnesota", "United States"),
         ("Jordan, MN", "United States"),
         ("Atlanta, Georgia", "United States"),
         ("Atlanta, Ga", "United States"),
         ("London, England", "United Kingdom"),
         ("Prague, Czech Republic", "Czech Republic"),
         ("Virginia, USA", "United States"),
         ("Naperville, IL", "United States"),
         ("Toronto, Ontario, Canada", "Canada"),
         ("Berlin, DE", "Germany"),
         ("CSU Sacramento", "United States"),
         ("Philadelphia, PA", "United States"),
     )
     for location, country in expectations:
         assert get_country_from_location(location) == country
Exemplo n.º 14
0
 def test_get_country_from_location_corner_case_geographies(self):
     """test get_country_from_location on unusual geographies."""
     assert get_country_from_location("Palestine") == "Palestine"
     assert get_country_from_location(
         "San Francisco Bay Area") == "United States"