예제 #1
0
def addEthnicityFields(df, namefield):
    """Add census race/ethnicity percentage columns looked up by surname.

    The last whitespace-separated word of each value in ``df[namefield]`` is
    used as the surname and handed to ``ethnicolr.census_ln``. The six
    ``pct*`` columns of the result are then converted to float.

    Args:
        df: DataFrame containing the names. It is left unmodified.
        namefield: Name of the column in ``df`` holding the name strings.

    Returns:
        The DataFrame returned by ``ethnicolr.census_ln`` without the
        temporary ``ethname`` column, with ``pctwhite``, ``pctblack``,
        ``pctapi``, ``pctaian``, ``pct2prace`` and ``pcthispanic`` as
        float (values that do not parse as numbers become NaN).
    """
    # https://pypi.org/project/ethnicolr/#description
    import ethnicolr

    # Use only the last word of the field for analysis. The .str accessor
    # yields a missing value for missing or blank names, where indexing
    # ``t.split()[-1]`` per element would raise.
    # NOTE(review): how census_ln treats rows with a missing surname depends
    # on the installed ethnicolr version -- confirm for the pinned release.
    surnames = df[namefield].str.split().str[-1]

    # assign() returns a new frame, so the temporary column never shows up
    # on the DataFrame the caller passed in.
    enriched = ethnicolr.census_ln(df.assign(ethname=surnames), 'ethname')

    # Drop the temporary column again.
    enriched = enriched.drop(columns=['ethname'])

    newfields = [
        'pctwhite', 'pctblack', 'pctapi', 'pctaian', 'pct2prace', 'pcthispanic'
    ]
    for fieldname in newfields:
        # Go through str first so every cell is parsed the same way;
        # anything that is not a number is coerced to NaN.
        enriched[fieldname] = pd.to_numeric(enriched[fieldname].astype(str),
                                            errors='coerce').astype(float)

    return enriched
예제 #2
0
def run_census_last(subset_df, census_year):
    """
    Run census_ln on the rows of a teacher dataframe that have a last name.

    Rows whose ``teacher_last`` value is null are left out; the remaining
    rows are copied and passed to census_ln keyed on ``teacher_last``.

    Input:
        - subset_df: a dataframe with a ``teacher_last`` column
        - census_year: forwarded unchanged as the third argument of
          census_ln
    Output:
        - the dataframe returned by census_ln; no columns are selected
          and no missing values are filled in here
    """
    last_name_present = subset_df['teacher_last'].notna()
    teachers_with_names = subset_df.loc[last_name_present].copy()
    return census_ln(teachers_with_names, 'teacher_last', census_year)
#!/usr/bin/python
# -*- coding: utf-8 -*-

import pandas as pd

from ethnicolr import census_ln, pred_census_ln

# Three sample surnames, one record each, under a single 'name' key.
names = [{'name': surname} for surname in ('smith', 'zhang', 'jackson')]

df = pd.DataFrame(names)

# Show the input frame first.
print(df)

# census_ln without an explicit third argument.
print(census_ln(df, 'name'))

# The same lookup with 2010 passed as the third argument.
print(census_ln(df, 'name', 2010))

# pred_census_ln on the same column.
print(pred_census_ln(df, 'name'))
# NOTE(review): this excerpt is truncated. `white`, `black`, `asian` and
# `hispanic` are incremented below but never assigned in the visible lines,
# and the `try` has no visible except/finally clause.
# Running totals, increased by the per-batch column sums further down.
native_american = 0
two_race = 0
# Batch buffer: turned into a DataFrame at each flush, then reset to [].
# Presumably rows are appended to it after the visible part -- TODO confirm.
df = []

# The whole pass is skipped when 'ethnicity.pkl' already exists
# (presumably a saved result from an earlier run -- TODO confirm).
if not os.path.exists('ethnicity.pkl'):

    with open('full.json', 'r') as tweets_file:

        # The file is consumed one line at a time; idx is the 0-based line number.
        for idx, line in enumerate(tweets_file):

            try:

                # Flush on every 10000th line, but not on the very first one.
                if idx % 10000 == 0 and idx != 0:
                    print(idx)
                    df = pd.DataFrame(df)
                    classed = census_ln(df, 'name')
                    # Rows containing any missing value are discarded.
                    classed = classed.dropna()
                    classed = classed.drop(['name'], axis=1)
                    # '(S)' cells become 0 -- presumably the census marker
                    # for suppressed values; confirm against the data.
                    classed = classed.replace('(S)', 0)
                    classed = classed.astype('float64')
                    # Scale by 1/100 (presumably percent -> fraction).
                    classed = classed.divide(100)
                    # Add this batch's column sums to the running totals.
                    white += float(classed['pctwhite'].sum())
                    black += float(classed['pctblack'].sum())
                    asian += float(classed['pctapi'].sum())
                    native_american += float(classed['pctaian'].sum())
                    two_race += float(classed['pct2prace'].sum())
                    hispanic += float(classed['pcthispanic'].sum())
                    # Start a fresh batch.
                    df = []

                # Each line is parsed as its own JSON document; the name is
                # read from its 'user' -> 'name' entry.
                tweet = json.loads(line)
                name = tweet['user']['name']
예제 #5
0
def run_census_ln(subset_df, census_year):
    """Run census_ln on the rows that have a contributor last name.

    Rows whose ``contributor_lname`` is null are left out; the rest are
    copied and passed to census_ln keyed on ``contributor_lname``, with
    ``census_year`` forwarded unchanged as the third argument.
    """
    last_name_present = subset_df['contributor_lname'].notna()
    contributors_with_names = subset_df.loc[last_name_present].copy()
    return census_ln(contributors_with_names, 'contributor_lname', census_year)