def main():
    """Summarize unemployment and schooling for Cook County, IL block groups.

    Requests 2015 ACS 5-year variables from tables B23025 and B15003 for
    every block group in state 17 / county 031, derives the columns
    ``percent_unemployed`` and ``percent_nohs``, prints summary statistics,
    the 30 rows with the highest ``percent_unemployed`` and the correlation
    matrix, and finally exports the two derived columns to ``cook_data.csv``.
    """
    for option, value in (('display.expand_frame_repr', False),
                          ('display.precision', 2)):
        pd.set_option(option, value)

    survey, year = 'acs5', 2015

    # Step 1 - find the tables holding the variables of interest. The ACS
    # Table Shells are the reference:
    # https://www.census.gov/programs-surveys/acs/technical-documentation/summary-file-documentation.html
    # censusdata.search matches a text pattern; the slices narrow the hits
    # down to the relevant variables. The results are not used further.
    censusdata.search(survey, year, 'label', 'unemploy')[160:170]
    censusdata.search(survey, year, 'concept', 'education')[730:790]

    # Step 2 - print the variables contained in each table.
    for table_id in ('B23025', 'B15003'):
        censusdata.printtable(censusdata.censustable(survey, year, table_id))

    # Step 3 - identify the geography: block groups in Cook County, IL.
    # State FIPS codes (IL is 17):
    #   print(censusdata.geographies(censusdata.censusgeo([('state','*')]), 'acs5', 2015))
    # Counties within IL (Cook is 031):
    #   print(censusdata.geographies(censusdata.censusgeo([('state','17'), ('county', '*')]), 'acs5', 2015))
    cook_block_groups = censusdata.censusgeo(
        [('state', '17'), ('county', '031'), ('block group', '*')])

    # Step 4 - download. B15003_002E .. B15003_016E are the columns summed
    # for the "no high school degree" share.
    no_hs_vars = ['B15003_{:03d}E'.format(n) for n in range(2, 17)]
    requested = ['B23025_003E', 'B23025_005E', 'B15003_001E'] + no_hs_vars
    cook_cnty = censusdata.download(survey, year, cook_block_groups, requested)

    cook_cnty['percent_unemployed'] = (
        cook_cnty['B23025_005E'] / cook_cnty['B23025_003E'] * 100)

    # Accumulate left to right with `+` so missing values propagate exactly
    # as they would in one long chained addition.
    no_hs_count = cook_cnty[no_hs_vars[0]]
    for var_name in no_hs_vars[1:]:
        no_hs_count = no_hs_count + cook_cnty[var_name]
    cook_cnty['percent_nohs'] = no_hs_count / cook_cnty['B15003_001E'] * 100

    cook_cnty = cook_cnty[['percent_unemployed', 'percent_nohs']]
    print(cook_cnty.describe())

    # The 30 block groups with the highest unemployment rate, shown with
    # their share of people lacking a high school degree.
    worst_first = cook_cnty.sort_values('percent_unemployed', ascending=False)
    print(worst_first.head(30))

    # Correlation between the two derived measures.
    print(cook_cnty.corr())

    censusdata.exportcsv('cook_data.csv', cook_cnty)
Пример #2
0
 def test_search(self):
     """Exercise censusdata.search with string and callable criteria.

     Return values are not inspected; each call only has to complete
     without raising.
     """
     def unweighted_housing_sample(value):
         # Both patterns must be found, ignoring case.
         return (re.search('unweighted sample', value, re.IGNORECASE)
                 and re.search('housing', value, re.IGNORECASE))

     # (source, year, criterion) - every search targets the 'concept' field.
     cases = [
         ('acs5', 2015, 'unweighted sample'),
         ('acs5', 2018, 'SEX BY AGE'),
         ('acs5', 2015, unweighted_housing_sample),
         ('sf1', 2010, 'JUVENILE FACILITIES'),
         ('acsse', 2019, 'SEX BY AGE'),
     ]
     for src, year, criterion in cases:
         censusdata.search(src, year, 'concept', criterion)
Пример #3
0
def GetFieldList(table, year):
    """Return "<variable> <third search element>" strings for one table.

    Args:
        table: Table ID; converted to an upper-case string before matching.
        year: ACS year; passed through ``alt_search_year`` before searching.

    Returns:
        list of str, one per search hit whose variable name starts with the
        table ID (text before the first underscore) and ends in 'E'.
    """
    search_year = alt_search_year(year)
    table_id = str(table).upper()

    # Search both the 'concept' and the 'group' field, concept hits first.
    hits = cd.search('acs5', search_year, 'concept', table_id)
    hits = hits + cd.search('acs5', search_year, 'group', table_id)

    field_list = []
    for hit in hits:
        name = hit[0]
        if name.split("_")[0] != table_id:
            continue
        if name[-1] != 'E':
            continue
        # hit[2] is presumably the variable label - confirm against censusdata.
        field_list.append("{0} {1}".format(name, hit[2]))

    return field_list
def GetFieldList(table, year):
    """
    Build the list of fields belonging to a single table ID.

    Args:
        table (str): Table ID (upper-cased before matching)
        year (int): ACS year (mapped through ``alt_search_year`` first)

    Returns:
        list: "<variable name> <third element of the search hit>" strings for
        every hit whose name prefix equals the table ID and whose name ends
        with 'E'.
    """
    lookup_year = alt_search_year(year)
    wanted = str(table).upper()

    by_concept = cd.search('acs5', lookup_year, 'concept', wanted)
    by_group = cd.search('acs5', lookup_year, 'group', wanted)

    # Concept hits come first, then group hits; duplicates are not removed.
    return [
        "{0} {1}".format(entry[0], entry[2])
        for entry in by_concept + by_group
        if entry[0].split("_")[0] == wanted and entry[0][-1] == 'E'
    ]
def main(verbose=False, data_dir='../data/'):
    """Download 2015 ACS race shares for Cook County tracts and tract GeoJSON.

    Requests table B02001 (margin-of-error variables removed) for every
    Illinois census tract, converts the race counts into shares of the tract
    total, keeps the rows whose county name is 'Cook' and writes them to
    ``Illinois2015CensusTractsDemographics.csv`` inside *data_dir*. It then
    downloads the Illinois tract GeoJSON to ``Illinois2015CensusTracts.json``
    in the same directory via ``download_file``.

    Args:
        verbose: When true, print the race-related variables, the table
            layout, the selected variable names, a sample of up to 10 rows
            and summary statistics.
        data_dir: Directory receiving both output files. A trailing path
            separator is optional.
    """
    import os  # local import so the block does not depend on file-level imports

    # Fetch the table definition once; it feeds both the verbose print-out
    # and the variable list.
    table = censusdata.censustable('acs5', 2015, 'B02001')

    if verbose:
        pd.set_option('display.expand_frame_repr', False)
        pd.set_option('display.precision', 2)

        print("Available race variables:")
        print(censusdata.search('acs5', 2015, 'label', 'race'))
        print("Table to download:")
        censusdata.printtable(table)

    # remove variables for margin of errors
    variables = [name for name in table.keys() if not name.endswith('M')]
    if verbose:
        print("Variables:")
        print(variables)

    illinois_demo = censusdata.download(
        'acs5', 2015, censusdata.censusgeo([('state', '17'), ('tract', '*')]),
        variables)

    illinois_demo = illinois_demo.rename(columns={
        'B02001_001E': 'total',
        'B02001_002E': 'white',
        'B02001_003E': 'black',
        'B02001_004E': 'native',
        'B02001_005E': 'asian',
        'B02001_006E': 'pacific',
        'B02001_007E': 'other',
        'B02001_008E': 'two_or_more',
        'B02001_009E': 'two_or_more_including_other',
        'B02001_010E': 'two_or_more_excluding_other',
    })

    # Fold both "two or more races" sub-counts into the 'other' bucket.
    illinois_demo['other'] = (
        illinois_demo['other']
        + illinois_demo['two_or_more_including_other']
        + illinois_demo['two_or_more_excluding_other'])

    share_cols = ['white', 'black', 'native', 'asian', 'pacific', 'other']
    # Explicit copy: the frame is modified below, and writing into a
    # column-sliced view triggers pandas' chained-assignment warning.
    illinois_demo = illinois_demo[['total'] + share_cols].copy()
    # Row-wise division of every race count by the tract total.
    illinois_demo[share_cols] = illinois_demo[share_cols].div(
        illinois_demo['total'], axis=0)

    # The index holds the censusgeo objects; pull the identifiers out of
    # them, then replace the index with a plain positional one.
    geos = list(illinois_demo.index)
    illinois_demo = illinois_demo.reset_index(drop=True)
    illinois_demo['tract'] = [str(geo.geo[2][1]) for geo in geos]
    illinois_demo['county'] = [geo.geo[1][1] for geo in geos]
    # Second comma-separated part of the name, minus its leading space and
    # its last 7 characters (presumably the " County" suffix - confirm).
    illinois_demo['county_name'] = [
        geo.name.split(',')[1][1:-7] for geo in geos
    ]

    if verbose:
        # n= avoids the division by zero that frac=10/len() hits on an
        # empty frame and the frac > 1 error with fewer than 10 rows.
        print(illinois_demo.sample(n=min(10, len(illinois_demo))))
        print(illinois_demo.describe())

    illinois_demo = illinois_demo.loc[illinois_demo['county_name'] == 'Cook']
    # os.path.join tolerates a data_dir with or without a trailing separator.
    illinois_demo.to_csv(
        os.path.join(data_dir, 'Illinois2015CensusTractsDemographics.csv'))
    print("Successfully downloaded Illinois demographic data.")

    url = "https://github.com/uscensusbureau/citysdk/raw/master/v2/GeoJSON/500k/2015/17/tract.json"
    fname = 'Illinois2015CensusTracts.json'
    target = os.path.join(data_dir, fname)
    download_file(url, target)
    print("Successfully downloaded Illinois census tract shapefile.")
Пример #6
0
# Script fragment: read the flagged ACS table-shell variables, demonstrate a
# few censusdata helpers, then pull data for each remaining variable.

# Presumably the destination for cleaned output; not used in the visible code.
output = 'data/cleaned/01_Demographic'
# location of table shells where I've flagged which variables to use
input_drive = 'data/raw'

# Parse the first sheet of the table-shell workbook into a DataFrame.
table_shell = os.path.join(input_drive, 'ACS2017_Table_Shells.xlsx')
xl = pd.ExcelFile(table_shell)
table_shell_df = xl.parse(xl.sheet_names[0])
# variables I've flagged to use (rows whose `Use` column equals 1)
use_vars = table_shell_df[table_shell_df.Use == 1]
print(use_vars[['TableID', 'Stub', 'Use']])
# Save the flagged rows next to the raw inputs.
use_vars.to_csv(os.path.join(input_drive, 'ACS_variables.csv'))
variables = use_vars.TableID.tolist()

# Use the census data package
# Examples of functionality (return values are discarded here)
censusdata.search('acs5', 2017, 'label', 'unemploy')
# censusdata.search('acs5', 2017, 'concept', 'education')
censusdata.printtable(censusdata.censustable('acs5', 2017, 'B23025'))
censusdata.geographies(censusdata.censusgeo([('state', '*')]), 'acs5', 2017)
censusdata.geographies(
    censusdata.censusgeo([('state', '08'), ('county', '*')]), 'acs5', 2017)

# doesn't seem like the C variables work, so remove them
# (this drops every ID that contains the letter 'C' anywhere)
variables = [var for var in variables if 'C' not in var]
# Also drop every ID containing "B17002".
variables = [var for var in variables if "B17002" not in var]

# loop through all variables and merge data together
# NOTE(review): the merge step and any use of `count` are not present in the
# visible code; `censusdata_pull` is defined elsewhere.
count = 0
for variable in variables:
    print(variable)
    data = censusdata_pull(variable)
# Notebook-style bare expression: `county_id` must already exist (it is
# defined outside the visible code).
county_id ##dictionary with name as key and census geo object
##list(county_id.keys())[0:10]

# County codes to process; the longer candidate list is kept for reference.
#austin_counties = ['021', '055', '209', '453', '491']
austin_counties = ['453']

## Get all the census tracts from the counties
## state > county > tract

austin_tracts_names = []
# NOTE(review): the loop variable rebinds `county_id`, so after the loop it
# holds a county code string instead of the dictionary referenced above.
for county_id in austin_counties:
  county_tracts = censusdata.geographies(censusdata.censusgeo([('state', '48'), ('county', county_id), ('tract', '*')]), 'acs5', 2016)
  austin_tracts_names.extend(county_tracts.keys())

## Search for variables whose label matches 'white'; the trailing 'profile'
## argument presumably selects the data-profile tables (confirm against the
## censusdata.search signature). The result is discarded.
censusdata.search('acs5', 2016, 'label', 'white', 'profile')

# Download variable DP05_0032PE for the single MSA geography 12420. No
# per-tract percentages are computed in the visible code.
# NOTE(review): DP05_* looks like a data-profile variable, yet no table type
# is passed to download() (unlike the search above) - confirm this call
# reaches the profile tables.

austin_msa = censusdata.download('acs5', 2016,
                             censusdata.censusgeo([('metropolitan statistical area/micropolitan statistical area', '12420')]),
                             ['DP05_0032PE'])

## I think the call above returns the right percent-white figure; the share
## turns out to be large.
## Think about: how does the construction of Hispanic/Latino as a race shape this?
## ACS: "The data on race are based on self-identification and the categories on the form generally reflect a social definition of race"

## AH: Yes! Here's a good paper on this -- https://www.annualreviews.org/doi/abs/10.1146/annurev.soc.29.010202.100006
##austin_msa = censusdata.download('acs5', 2016,
                             #censusdata.censusgeo([('state', '48')]),
Пример #8
0
import os
import argparse


# Command-line entry point: each flag enables one independent stage, and any
# combination of flags may be given.
parser = argparse.ArgumentParser()
parser.add_argument('--search', action='store_true',
                    help="To perform a search for variables")
parser.add_argument('--get', action='store_true',
                    help="To load variables data from the CENSUS data and store to csv")
parser.add_argument('--store', action='store_true',
                    help="To load data from csv into the database")
args = parser.parse_args()

if args.search:
    # to search for variables in CENSUS data
    # (named `matches` rather than `vars` so the builtin is not shadowed)
    matches = censusdata.search('acs5', 2018, 'label', 'geoid', tabletype='detail')
    print(f"Found {len(matches)} matching variables.")
    # prints all retrieved census data variables to file, one per line;
    # explicit UTF-8 keeps the output independent of the platform default
    with open("search_results.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{match}\n" for match in matches)

if args.get:
    # to download the data from the CENSUS
    # (`download_data` is defined elsewhere)
    df = download_data('useful_variables.txt')
    # saves the retrieved data to a csv
    df.to_csv('data.csv', index=False)

if args.store:
    # Target table and schema for the database load.
    table_name = "amritsin_hw1"
    schema_name = "acs"
Пример #9
0
import censusdata

# Search 2015 ACS 5-year variable labels for 'unemploy' and show hits
# 160 through 169 of the result list.
unemployment_hits = censusdata.search('acs5', 2015, 'label', 'unemploy')
x = unemployment_hits[160:170]

print(x)
Пример #10
0
# Build a census-tract -> ZCTA5 lookup so census data can later be matched
# to the education data.
census_zipcode_relation_filename = 'zcta_tract.csv'
census_zipcode_relation = pd.read_csv(
    census_zipcode_relation_filename, delimiter=',', dtype=str)
# Keep only the rows whose STATE code is '36' and the two columns needed.
ny_tract_to_zipcode = census_zipcode_relation.loc[
    census_zipcode_relation['STATE'] == '36', ['TRACT', 'ZCTA5']]

# When a TRACT value occurs more than once, the last row wins.
tract_zipcode = dict(
    zip(ny_tract_to_zipcode['TRACT'], ny_tract_to_zipcode['ZCTA5']))


# Median income (ACS 5-year 2015, variable B06011_001E) for every tract in
# each county; one download per county code. The search and table look-ups
# are exploratory and their results are discarded.
censusdata.search('acs5', 2015, 'label', 'median income')
censusdata.censustable('acs5', 2015, 'B06011')
median_income_bronx = censusdata.download(
    'acs5', 2015,
    censusdata.censusgeo([('state', '36'), ('county', '005'), ('tract', '*')]),
    ['B06011_001E'])
median_income_kings = censusdata.download(
    'acs5', 2015,
    censusdata.censusgeo([('state', '36'), ('county', '047'), ('tract', '*')]),
    ['B06011_001E'])
median_income_ny = censusdata.download(
    'acs5', 2015,
    censusdata.censusgeo([('state', '36'), ('county', '061'), ('tract', '*')]),
    ['B06011_001E'])
median_income_queens = censusdata.download(
    'acs5', 2015,
    censusdata.censusgeo([('state', '36'), ('county', '081'), ('tract', '*')]),
    ['B06011_001E'])
median_income_richmond = censusdata.download('acs5', 2015, \
    censusdata.censusgeo([('state', '36'), ('county', '085'), \
Пример #11
0
# # )
# # # print( getattr(censusdata.download,'name') )

# c = censusdata.download('acs5', 2015, censusdata.censusgeo([('county', 'Kings County')]),
#                                    ['B01001_001E', 'B01001_020E'])
# print(c.describe())

###############################################
import pandas as pd
import censusdata
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.precision', 2)
import statsmodels.formula.api as sm

# Collect every 2018 ACS 1-year variable whose concept matches 'age'.
# tableColDict maps variable code -> [second element, third element] of the
# search hit; tableColCodeArray keeps the codes in search order.
ageTableCols = censusdata.search('acs1', 2018, 'concept', 'age')
tableColDict = dict()
tableColCodeArray = []
for entry in ageTableCols:
    # Read the fields straight from the hit. Parsing str(entry) by splitting
    # on ", " breaks any text that itself contains ", ", and stripping
    # parentheses/quotes eats legitimate trailing ")" characters.
    tableColCode, tableColSection, tableColVar = entry[0], entry[1], entry[2]
    tableColDict[tableColCode] = [tableColSection, tableColVar]
    tableColCodeArray.append(tableColCode)

a = [
    'B01001A_001E', 'B01]1A_002E', 'B01001A_003E', 'B01001A_004E',
    'B01001A_005E'