def get_county_populations_1980s(
        data_path: Optional[str] = None) -> pd.DataFrame:
    """Build a tidy table of 1980s county population estimates.

    Downloads one Census spreadsheet per year (1980-1989), collapses the
    age/race/sex breakdown into a single population total per
    (year, county), and splits the combined FIPS code into separate
    state and county codes.
    """
    per_year = []
    for yr in range(1980, 1990):
        raw = pd.read_excel(
            get_path(
                f"https://www2.census.gov/programs-surveys/popest/tables/1980-1990/counties/asrh/pe-02-{yr}.xls",
                data_path,
            ),
            skiprows=5,
        )
        raw = raw.rename(columns={
            "Year of Estimate": "year",
            "FIPS State and County Codes": "combined_fips",
        })
        # Sum every demographic-breakdown column into one total per county.
        totals = (
            raw.dropna(subset=["year"])
            .groupby(["year", "combined_fips"])
            .sum()
            .sum(axis=1)
            .rename("population")
            .reset_index()
        )
        per_year.append(totals)

    out = pd.concat(per_year)
    out["combined_fips"] = out["combined_fips"].astype("Int64")
    out["year"] = out["year"].astype("Int64").astype(str)
    # Combined FIPS packs state and county as state*1000 + county.
    out["state_code"] = out["combined_fips"] // 1000
    out["county_code"] = out["combined_fips"] % 1000
    return out.drop(columns=["combined_fips"])
def get_state_populations_2010_through_2019(
    data_path: Optional[str] = None,
) -> pd.DataFrame:
    """Tidy state population estimates for 2010-2019.

    NOTE: this function is not used anymore.
    """
    frame = pd.read_excel(
        get_path(
            "https://www2.census.gov/programs-surveys/popest/tables/2010-2019/state/totals/nst-est2019-01.xlsx",
            data_path,
        ),
        skiprows=3,
        skipfooter=5,
    )
    frame = frame.rename(columns={"Unnamed: 0": "state"})
    frame = frame.dropna(subset=["state"])
    frame.columns = frame.columns.astype(str)
    # Unclear what to do with "Census" / "Estimates Base" or how they differ
    # from the "2010" column, so drop both.
    frame = frame.drop(columns=["Census", "Estimates Base"])
    # State names in the sheet are prefixed with leading dots.
    frame["state"] = frame["state"].str.lstrip(".")
    numeric_cols = {col: int for col in frame.columns if col != "state"}
    frame = frame.astype(numeric_cols)
    return frame.melt(id_vars="state", var_name="year", value_name="population")
def get_state_populations_2000s(
        data_path: Optional[str] = None) -> pd.DataFrame:
    """Tidy state population estimates for 2000-2009.

    Reads the Census 2000-2010 intercensal state table and melts it to
    (state, year, population) rows, keeping only the July 1 annual
    estimates for 2000-2009.
    """
    df = pd.read_excel(
        get_path(
            "https://www2.census.gov/programs-surveys/popest/tables/2000-2010/intercensal/state/st-est00int-01.xls",
            data_path,
        ),
        skiprows=3,
        skipfooter=8,
    )
    # Fixed: the unnamed columns in this 2000s file are the April 1, 2000
    # census count, the April 1, 2010 census count, and the July 1, 2010
    # estimate (the old labels "2010-04-01"/"2020-04-01"/"2020-07-01" were
    # copied from a later decade's table and were wrong). All three are
    # dropped below, so the output is unchanged.
    df = df.rename(
        columns={
            "Unnamed: 0": "state",
            "Unnamed: 1": "2000-04-01",
            "Unnamed: 12": "2010-04-01",
            "Unnamed: 13": "2010-07-01",
        }).dropna(subset=["state"])
    df.columns = df.columns.astype(str)
    # State names in the sheet are prefixed with leading dots.
    df["state"] = df["state"].str.lstrip(".")
    df = df.astype({col: int for col in df.columns if col != "state"})
    # We only want the July 1 estimates for 2000-2009.
    df = df.drop(columns=["2000-04-01", "2010-04-01", "2010-07-01"])
    return df.melt(id_vars="state", var_name="year", value_name="population")
def _get_counties_population_table_1990s(year: int,
                                         data_path: Optional[str] = None
                                         ) -> pd.DataFrame:
    """Load one year's 1990s county population file (stch-icen{year}.txt).

    Returns the raw demographic rows with added ``state_code`` (from the
    5-digit combined FIPS), a 4-digit string ``year``, and the state name.

    Raises AssertionError if ``year`` is outside 1990-1999.
    """
    assert 1990 <= year <= 1999
    df = pd.read_csv(
        get_path(
            f"https://www2.census.gov/programs-surveys/popest/tables/1990-2000/intercensal/st-co/stch-icen{year}.txt",
            data_path,
        ),
        # delim_whitespace=True is deprecated (pandas 2.2+); sep=r"\s+" is
        # its documented equivalent.
        sep=r"\s+",
        names=[
            "year",
            "state_county_code",
            "age_group",
            "race_sex",
            "ethnic_origin",
            "population",
        ],
        dtype=int,
    )
    # the county code is formatted as a 5-digit number - first 2 digits are
    # state code, next 3 are county code
    df["state_code"] = df["state_county_code"] // 1000
    # File stores two-digit years; prepend the century.
    df["year"] = "19" + df["year"].astype(str)
    FIPS_NAME_MAPPING = {
        int(k): v
        for k, v in us.states.mapping("fips", "name").items() if k is not None
    }
    df["state"] = df["state_code"].map(FIPS_NAME_MAPPING)
    return df
def get_place_populations_2010s(data_path: Optional[str] = None) -> pd.DataFrame:
    """Place-level population estimates for 2010-2020.

    This source file contains consolidated-city rows, which the shared
    loader is told to remove.
    """
    source = get_path(
        "https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/cities/SUB-EST2020_ALL.csv",
        data_path,
    )
    return _get_recent_decades_df(
        source,
        has_consolidated_cities=True,
        years=list(range(2010, 2021)),
    )
def get_place_populations_2000s(data_path: Optional[str] = None) -> pd.DataFrame:
    """Place-level population estimates for 2000-2010.

    Unlike the 2010s file, this source has no consolidated-city rows, so
    nothing needs to be filtered out.
    """
    source = get_path(
        "https://www2.census.gov/programs-surveys/popest/datasets/2000-2010/intercensal/cities/sub-est00int.csv",
        data_path,
    )
    return _get_recent_decades_df(
        source,
        has_consolidated_cities=False,
        years=list(range(2000, 2011)),
    )
def _get_places_crosswalk_df(data_path: Optional[str] = None) -> pd.DataFrame:
    """Crosswalk mapping Census place codes to FIPS place codes."""
    crosswalk = pd.read_fwf(
        get_path(
            "https://www2.census.gov/geo/tiger/PREVGENZ/pl/us_places.txt",
            data_path,
        )
    )
    # CENSUS packs state and place as state*10000 + place.
    crosswalk["State Code"] = crosswalk["CENSUS"] // 10000
    crosswalk["Place Code"] = crosswalk["CENSUS"] % 10000
    crosswalk = crosswalk.rename(columns={"FIPS": "place_fips"})
    crosswalk["place_fips"] = crosswalk["place_fips"].astype("Int64")
    return crosswalk
def get_county_fips_crosswalk(data_path: Optional[str] = None) -> pd.DataFrame:
    """County FIPS crosswalk: (state_code, county_code, county_name)."""
    geocodes = pd.read_excel(
        get_path(
            "https://www2.census.gov/programs-surveys/popest/geographies/2019/all-geocodes-v2019.xlsx",
            data_path,
        ),
        skiprows=4,
    )
    # County code 0 marks state-level summary rows, not real counties.
    geocodes = geocodes[geocodes["County Code (FIPS)"] != 0]
    renames = {
        "State Code (FIPS)": "state_code",
        "County Code (FIPS)": "county_code",
        "Area Name (including legal/statistical area description)": "county_name",
    }
    return geocodes[list(renames)].rename(columns=renames)
def get_state_populations_2010s(
        data_path: Optional[str] = None) -> pd.DataFrame:
    """Tidy state population estimates for 2010-2020.

    This one goes through 2020.
    """
    raw = pd.read_csv(
        get_path(
            "https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/state/totals/nst-est2020-alldata.csv",
            data_path,
        ))
    years = range(2010, 2021)
    keep_cols = ["NAME"] + [f"POPESTIMATE{y}" for y in years]
    tidy = raw[keep_cols]
    tidy = tidy.rename(columns={f"POPESTIMATE{y}": str(y) for y in years})
    tidy = tidy.rename(columns={"NAME": "state"})
    return tidy.melt(id_vars=["state"], var_name="year",
                     value_name="population")
def get_county_populations_2010s(data_path: Optional[str] = None) -> pd.DataFrame:
    """Tidy county population estimates for 2010-2020.

    Returns rows of (county_code, state_code, year, population). The
    return annotation was added for consistency with the sibling loaders.
    """
    df = pd.read_csv(
        get_path(
            "https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/counties/totals/co-est2020-alldata.csv",
            data_path,
        ),
        # File is not valid UTF-8 (accented county names).
        encoding="latin_1",
    )
    rename_cols = {
        "POPESTIMATE" + str(year): str(year)
        for year in range(2010, 2021)
    }
    rename_cols.update({"STATE": "state_code", "COUNTY": "county_code"})
    # Select with an explicit list rather than a dict_keys view.
    df = df[list(rename_cols)].rename(columns=rename_cols)
    df = df.melt(id_vars=["county_code", "state_code"],
                 var_name="year",
                 value_name="population")
    return df
def get_county_populations_2000s(
        data_path: Optional[str] = None) -> pd.DataFrame:
    """Tidy county population estimates for 2000-2009.

    Downloads one per-state intercensal CSV (territories excluded), melts
    to (state_code, year, population) rows, and attaches county FIPS codes
    by joining county names against the geocodes crosswalk. Rows whose
    county name fails to match the crosswalk are dropped, as is 2010
    (which is taken from the 2010s dataset instead).
    """
    urls = [
        (
            state.fips,
            f"https://www2.census.gov/programs-surveys/popest/tables/2000-2010/"
            f"intercensal/county/co-est00int-01-{state.fips}.csv",
        )
        for state in us.STATES_AND_TERRITORIES
        if state.fips not in ["60", "66", "69", "72", "78"]  # exclude territories
    ]
    col_names = [
        "County Name",
        "2000-04-01",
        "2000",
        "2001",
        "2002",
        "2003",
        "2004",
        "2005",
        "2006",
        "2007",
        "2008",
        "2009",
        "2010-04-01",
        "2010",
    ]
    dfs = []
    for state_code, url in urls:
        df = pd.read_csv(
            get_path(url, data_path),
            names=col_names,
            skiprows=4,
            skipfooter=8,
            encoding="latin_1",
            # skipfooter is only supported by the python engine; without
            # this, pandas warns and silently falls back for every file.
            engine="python",
        )
        df["state_code"] = state_code
        # County names in the file are prefixed with leading dots.
        df["County Name"] = df["County Name"].str.lstrip(".")
        dfs.append(df)
    df = pd.concat(dfs)
    df = df.rename(columns={"County Name": "county_name"})
    # Drop the April 1 census columns; we only keep July 1 estimates.
    df = df.drop(columns=["2000-04-01", "2010-04-01"])
    df["state_code"] = df["state_code"].astype(int)
    df = df.melt(id_vars=["county_name", "state_code"],
                 var_name="year",
                 value_name="population")
    df = df.merge(get_county_fips_crosswalk(),
                  how="left",
                  on=["county_name", "state_code"])
    df = df.drop(columns=["county_name"])
    df = df[df["county_code"].notnull()].copy()
    # Populations are comma-formatted strings like "1,234".
    df["population"] = (df["population"].str.replace(
        ",", "").astype("float").astype("Int64"))
    df["county_code"] = df["county_code"].astype("Int64")
    # Use 2010 from the 2010s dataset
    # (I would like to do some smoothing later but let's not worry about that for now)
    df = df[df["year"] != "2010"].copy()
    return df