print df1.P0010001.sum() print print df1.P0010001.astype(int).sum() # <markdowncell> # **Q22**: Why is `df1.P0010001.sum()` different from `df1.P0010001.astype(int).sum()`? # <markdowncell> # **A22**: # The data type of `df1.P0010001` is a string. Hence, performing `sum` on it concatenates the string representation of populations into a longer string. In contrast, once `df1.P0010001` is converted into integers via `df1.P0010001.astype(int)`, a `sum` operation adds up all the populations into a single integer. # <codecell> df1.P0010001 = df1.P0010001.astype(int) df1[['NAME','P0010001']].sort('P0010001', ascending=True).head() # <markdowncell> # **Q23**: Describe the output of the following: # # ```Python # df1.P0010001 = df1.P0010001.astype(int) # df1[['NAME','P0010001']].sort('P0010001', ascending=True).head() # ``` # <markdowncell> # **A23**: # A DataFrame (with 5 rows and 2 columns (NAME, P0010001)) listing the 5 least populous states in ascending order by population.
from itertools import islice c=census.Census(settings.CENSUS_KEY) def places(variables="NAME"): for state in us.states.STATES: print state geo = {'for':'place:*', 'in':'state:{s_fips}'.format(s_fips=state.fips)} for place in c.sf1.get(variables, geo=geo): yield place r = list(islice(places("NAME,P0010001"), None)) places_df = DataFrame(r) places_df.P0010001 = places_df.P0010001.astype('int') places_df['FIPS'] = places_df.apply(lambda s: s['state']+s['place'], axis=1) print "number of places", len(places_df) print "total pop", places_df.P0010001.sum() places_df.head() assert places_df.P0010001.sum() == 228457238 # number of places in 2010 Census assert len(places_df) == 29261 # <markdowncell> # # Apply and lambda functions #
# placeholder generator # replace with your own code for k in []: yield k # <codecell> # use this code to run your code # I recommend replacing the None in islice to a small number to make sure you're on # the right track r = list(islice(places("NAME,P0010001"), None)) places_df = DataFrame(r) places_df.P0010001 = places_df.P0010001.astype("int") places_df["FIPS"] = places_df.apply(lambda s: s["state"] + s["place"], axis=1) print "number of places", len(places_df) print "total pop", places_df.P0010001.sum() places_df.head() # <codecell> # if you've done this correctly, the following asserts should stop complaining assert places_df.P0010001.sum() == 228457238 # number of places in 2010 Census assert len(places_df) == 29261
'for': 'place:*', 'in': 'state:{s_fips}'.format(s_fips=state.fips) } for place in c.sf1.get(variables, geo=geo): yield place # <markdowncell> # Now we compute a DataFrame for the places: `places_df` # <codecell> r = list(islice(places("NAME,P0010001"), None)) places_df = DataFrame(r) places_df.P0010001 = places_df.P0010001.astype('int') print "number of places", len(places_df) print "total pop", places_df.P0010001.sum() places_df.head() # <markdowncell> # We display the most populous places from California # <codecell> places_df[places_df.state == '06'].sort_index(by='P0010001', ascending=False).head() # <markdowncell>
# diversity() and r are defined outside this chunk.
df2 = diversity(r)

# <codecell>

# Rank the rows of df2 by their 'entropy5' column, highest first.
df2.sort_index(by='entropy5', ascending=False)

# <codecell>

# Materialise everything the msas() generator yields
# (islice(..., None) imposes no limit).
msas_list = list(islice(msas('NAME,P0010001'), None))

# <codecell>

len(msas_list)

# <codecell>

df = DataFrame(msas_list)

# <codecell>

# P0010001 is cast to int so it can be summed numerically.
df.P0010001 = df.P0010001.astype('int')

# <codecell>

# Sum of P0010001 per metropolitan/micropolitan area. The built-in grouped
# sum replaces apply(lambda x: sum(...)), which ran Python's builtin sum over
# each group; the totals are the same.
df.groupby('metropolitan statistical area/micropolitan statistical area')['P0010001'].sum()

# <codecell>

type(r)
for state in us.states.STATES: geo = {'for': 'county:*', 'in': 'state:{fips}'.format(fips=state.fips)} for county in c.sf1.get(variables, geo=geo): yield county # <codecell> counties_list = list(counties('NAME,P0010001')) # <codecell> # add up the population to make sure we have the total right counties_df = DataFrame(counties_list) counties_df.P0010001 = counties_df.P0010001.astype('int') counties_df.P0010001.sum() # <markdowncell> # One reason for writing all the counties in the form of a Python generator is tha you can easily control the number of counties we work with at any given time -- and then easily scaling out to get all of them. # <codecell> # make a list of the first ten counties from itertools import islice list(islice(counties2(), 10)) # <headingcell level=1>
def states(variables='NAME'):
    """Yield the 2010 SF1 'state:*' records that belong to us.states.STATES.

    variables -- comma-separated variable names, passed unchanged to
                 c.sf1.get.

    Records whose 'state' value is not the fips of an entry in
    us.states.STATES are dropped (the query also returns non-states).
    """
    known_fips = {s.fips for s in us.states.STATES}
    for record in c.sf1.get(variables, geo={'for': 'state:*'}, year=2010):
        if record['state'] not in known_fips:
            # need to filter out non-states
            continue
        yield record

# <codecell>

# make a dataframe from the total populations of states in the 2010 Census

df = DataFrame(states('NAME,P0010001'))
df['P0010001'] = df['P0010001'].astype('int')
df.head()

# <codecell>

# check that we have the right total population

df['P0010001'].sum() == 308745538

# <codecell>

# add a column with the first letter
# we'll be grouping states based on the first letter of the state NAME

df['first_letter'] = df['NAME'].apply(lambda name: name[0])
df.head()
for state in us.states.STATES: geo={'for':'county:*', 'in':'state:{fips}'.format(fips=state.fips)} for county in c.sf1.get(variables, geo=geo): yield county # <codecell> counties_list = list(counties('NAME,P0010001')) # <codecell> # add up the population to make sure we have the total right counties_df = DataFrame(counties_list) counties_df.P0010001 = counties_df.P0010001.astype('int') counties_df.P0010001.sum() # <markdowncell> # One reason for writing all the counties in the form of a Python generator is tha you can easily control the number of counties we work with at any given time -- and then easily scaling out to get all of them. # <codecell> # make a list of the first ten counties from itertools import islice list(islice(counties2(),10)) # <headingcell level=1>