def main():
    """Build the nationwide aggregate files from the per-state outputs.

    Each metric below is aggregated with ``aggregate_nationwide`` and the
    result is written, without its index, to the working directory:

        nationwide_classifications.csv (nationwide block equivalency file)
        nationwide_district_contains_county.csv
        nationwide_district_contains_district.csv
        nationwide_district_county_intersection.csv
    """
    fips = state_fips()

    # (metric name, column list handed to aggregate_nationwide, output file)
    jobs = [
        ('classifications',
         ['state', 'GEOID10', 'pop'],
         'nationwide_classifications.csv'),
        ('district_contains_county',
         ['state', 'COUNTYFP'],
         'nationwide_district_contains_county.csv'),
        ('district_contains_district',
         ['state', 'base', 'base_value'],
         'nationwide_district_contains_district.csv'),
        ('district_county_intersection',
         ['state', 'COUNTYFP'],
         'nationwide_district_county_intersection.csv'),
    ]

    # Run every aggregation first; nothing is written unless all succeed
    aggregated = [aggregate_nationwide(fips, metric, columns)
                  for metric, columns, _ in jobs]

    # Save aggregated files in the same order they were computed
    for frame, (_, _, out_path) in zip(aggregated, jobs):
        frame.to_csv(out_path, index=False)
def main():
    """Clean all census data in preparation for analysis.

    The steps run in this order, each receiving the state FIPS lookup:

        1. create a folder for each state
        2. split the nationwide county files into state files
        3. split the nationwide congressional district files into state files
        4. move the state legislative districts
        5. join census geography data to census population data

    NOTE(review): an earlier version of this docstring also listed removing
    duplicative boundary shapefiles; this function makes no such call.
    """
    fips = state_fips()

    # Ordered pipeline; every step takes the FIPS lookup as its only argument
    steps = (
        create_state_directories,
        split_counties,
        split_congressional_districts,
        move_state_legislative_districts,
        join_census_geo_and_pop,
    )
    for step in steps:
        step(fips)
def main():
    """Remove duplicative boundary shapefiles for each boundary type."""
    fips = state_fips()

    # Boundary types are processed in this fixed order
    for boundary in ('cd', 'county', 'sldl', 'sldu'):
        remove_duplicative_boundaries(fips, boundary)
def main():
    """Write a district/county intersection table for every state.

    For each state folder under ``clean_data/`` the county shapefile that is
    last in sorted file-name order is loaded, and every other shapefile in
    the folder (excluding census blocks) is passed through
    ``county_district_intersections``. The result is reduced to ``COUNTYFP``
    plus the column named by ``get_district_year`` for each district file
    and saved as ``<state>_district_county_intersection.csv`` in the state
    folder.
    """
    for state, _ in state_fips().items():
        state_dir = 'clean_data/' + state + '/'

        # Boundary shapefiles only; the census block shapefile is excluded
        shapefiles = [name for name in os.listdir(state_dir)
                      if name.endswith('.shp') and 'blocks' not in name]
        county_files = sorted(name for name in shapefiles
                              if 'county' in name)
        plan_files = [name for name in shapefiles if 'county' not in name]

        # The last name in sorted order is treated as the most recent file
        county_df = gpd.read_file(state_dir + county_files[-1])

        # Expose the county code under a single name; COUNTYFP10 wins when
        # both source columns are present because it is applied last
        for source_col in ('COUNTYFP00', 'COUNTYFP10'):
            if source_col in county_df.columns:
                county_df['COUNTYFP'] = county_df[source_col]

        # One output column is collected per district file
        output_cols = ['COUNTYFP']
        for plan_file in plan_files:
            print('INTERSECTIONS', plan_file, '\n')

            plan_df = gpd.read_file(state_dir + plan_file)
            plan_key = get_district_year(plan_file)
            plan_attr = district_attribute(plan_key)
            output_cols.append(plan_key)

            county_df = county_district_intersections(
                county_df, plan_key, plan_df, plan_attr)

        out_path = state_dir + state + '_district_county_intersection.csv'
        county_df[output_cols].to_csv(out_path, index=False)
def main():
    """Interpolate district boundaries on census block data.

    For every state folder under ``clean_data/`` this loads the census block
    shapefile and the three per-state CSVs (district-contains-district,
    district-contains-county, district/county intersection), then walks the
    redistricting plan shapefiles in the order sldl, sldu, cd (each group in
    sorted file-name order).

    A plan is skipped when its column is already present in the block table
    with no missing values (labels carried over from an existing
    ``<state>_classifications.csv``). Otherwise blocks are labelled from the
    county containment table first, the still-unlabelled blocks are passed to
    ``distribute_labels_by_subset``, and ``add_district_contains_district``
    is applied afterwards.

    Files written to the state folder:

        <state>_classifications_<district_year>.csv  one per processed plan,
            holding GEOID10 and that plan's column
        <state>_classifications.csv  GEOID10, pop and every plan column that
            exists in the block table, in plan order
    """
    fips = state_fips()

    # Iterate over each state
    for state, _ in fips.items():
        # Get the base path to the state folder
        base_path = 'clean_data/' + state + '/'

        # Load state census block shapefile
        blocks_path = base_path + state + '_blocks.shp'
        df = gpd.read_file(blocks_path)

        # Load district contains district and rename every column after the
        # first three with an '_imputed' suffix to mark it as an imputation
        district_path = base_path + state + '_district_contains_district.csv'
        df_district = pd.read_csv(district_path)
        district_cols = list(df_district.columns)
        df_district.columns = (
            district_cols[:3]
            + [x + '_imputed' for x in district_cols[3:]])

        # Load district and county containment dataframes
        county_path = base_path + state + '_district_contains_county.csv'
        df_county = pd.read_csv(county_path)

        # Load in district county intersections
        inter_path = base_path + state + '_district_county_intersection.csv'
        df_inter = pd.read_csv(inter_path)

        # Get the relevant redistricting plans, sorted within each level and
        # recombined so we interpolate in the proper order
        shapefiles = [x for x in os.listdir(base_path) if x.endswith('.shp')]
        sldl = sorted(x for x in shapefiles if 'sldl' in x)
        sldu = sorted(x for x in shapefiles if 'sldu' in x)
        cd = sorted(x for x in shapefiles if 'cd' in x)
        files = sldl + sldu + cd
        district_years = [get_district_year(x) for x in files]

        # Join most updated classifications from a previous run, if any
        class_path = base_path + state + '_classifications.csv'
        if os.path.isfile(class_path):
            df_class = pd.read_csv(class_path)
            # Restore the 15-character zero-padded key; presumably the CSV
            # round trip parses GEOID10 as a number and drops leading zeros
            df_class['GEOID10'] = df_class['GEOID10'].astype(str).str.zfill(15)
            # 'pop' is dropped before the merge; presumably the block table
            # already carries it (it is selected from df when saving below)
            df_class = df_class.drop('pop', axis=1)
            df = df.merge(df_class, on='GEOID10')

        # Iterate through each redistricting plan
        for file in files:
            # Get the district level and year
            district_year = get_district_year(file)

            # If the plan has already been fully classified in the block
            # file we can continue
            if (district_year in df.columns
                    and not df[district_year].isna().any()):
                continue

            # Reduce county contains dataframe and join
            df_county_plan = reduce_county_contains(df_county, district_year)
            df = add_district_contains_counties(df, df_county_plan,
                                                district_year)

            # Reduce county district intersection plan
            df_inter_plan = reduce_district_county_intersection(
                df_inter, district_year)

            # Split blocks into classified and unclassified
            df_classified = df[df[district_year].notna()]
            df_unclassified = df[df[district_year].isna()]

            # Show progress and load redistricting plan
            print('\nINTERPOLATING', file, len(df_classified),
                  len(df_unclassified))
            df_plan = gpd.read_file(base_path + file)

            # Distribute label to unclassified blocks
            if len(df_unclassified) > 0:
                dist_col = district_attribute(district_year)
                df_labeled = distribute_labels_by_subset(
                    df_plan, dist_col, df_unclassified, district_year,
                    df_inter_plan)

                # Combine classified and unclassified. pd.concat replaces
                # DataFrame.append, which was removed in pandas 2.0
                df_labeled = df_labeled.drop('check_districts', axis=1)
                df = pd.concat([df_classified, df_labeled])
                df = df.reset_index(drop=True)

            # Add district contains district
            print('\tDistrict Contains Districts')
            df = add_district_contains_district(df, df_district,
                                                district_year)

            # Remove imputed columns
            imputed_cols = [x for x in df.columns if 'imputed' in x]
            df = df.drop(columns=imputed_cols)

            # Get equivalency file for this plan
            equiv_path = base_path + state + '_classifications_'
            equiv_path += district_year + '.csv'
            df_equiv = df[['GEOID10', district_year]]
            df_equiv.to_csv(equiv_path, index=False)

        # Save block equivalency file for all plans
        print('\n\nSaving', state)
        state_path = base_path + state + '_classifications.csv'
        # Keep plan order (deduplicated) rather than set iteration order so
        # the saved column order is the same on every run
        plan_cols = [x for x in dict.fromkeys(district_years)
                     if x in df.columns]
        df_state = df[['GEOID10', 'pop'] + plan_cols]
        df_state.to_csv(state_path, index=False)

    return
def main():
    """Write a district-contains-county table for every state.

    For each state folder under ``clean_data/`` the county shapefile that is
    last in sorted file-name order is loaded and labelled against every
    other shapefile in the folder (excluding census blocks) with
    ``distribute_label``. A county keeps a label only when the intersection
    of the labelled district with the county covers at least 99.9% of the
    county's area; otherwise the label is set to None. The result, reduced
    to ``COUNTYFP`` plus one column per district file, is saved as
    ``<state>_district_contains_county.csv`` in the state folder.
    """
    for state, _ in state_fips().items():
        state_dir = 'clean_data/' + state + '/'

        # Boundary shapefiles only; the census block shapefile is excluded
        shapefiles = [name for name in os.listdir(state_dir)
                      if name.endswith('.shp') and 'blocks' not in name]
        county_files = sorted(name for name in shapefiles
                              if 'county' in name)
        plan_files = [name for name in shapefiles if 'county' not in name]

        # The last name in sorted order is treated as the most recent file
        county_df = gpd.read_file(state_dir + county_files[-1])

        # Expose the county code under a single name; COUNTYFP10 wins when
        # both source columns are present because it is applied last
        for source_col in ('COUNTYFP00', 'COUNTYFP10'):
            if source_col in county_df.columns:
                county_df['COUNTYFP'] = county_df[source_col]

        # One output column is collected per district file
        output_cols = ['COUNTYFP']
        for plan_file in plan_files:
            print('INTERPOLATING', plan_file, '\n')

            plan_df = gpd.read_file(state_dir + plan_file)
            plan_key = get_district_year(plan_file)
            plan_attr = district_attribute(plan_key)
            output_cols.append(plan_key)

            # Give every county a candidate district label
            county_df = distribute_label(plan_df, [plan_attr], county_df,
                                         [plan_key])

            # Null out the label wherever the district does not (almost)
            # fully cover the county
            for row_ix, county_row in county_df.iterrows():
                is_labelled_district = (
                    plan_df[plan_attr] == county_row[plan_key])
                district_geom = plan_df.loc[is_labelled_district,
                                            'geometry'].iloc[0]
                county_geom = county_row['geometry']

                overlap_area = district_geom.intersection(county_geom).area
                if overlap_area / county_geom.area < 0.999:
                    county_df.at[row_ix, plan_key] = None

        out_path = state_dir + state + '_district_contains_county.csv'
        county_df[output_cols].to_csv(out_path, index=False)
def main():
    """Write a district-contains-district table for every state.

    For each state folder under ``clean_data/`` the redistricting plan
    shapefiles are ordered sldl, sldu, cd (each group in sorted file-name
    order). Every plan except the last acts as a "base" and is labelled
    against each plan that follows it with ``distribute_label``. A base
    district keeps a label only when its intersection with the labelled
    district covers at least 99.9% of the base district's area; otherwise
    the label is set to None.

    Each base contributes rows with the columns ``base`` (its district
    year), ``base_col`` (its id column name), ``base_value`` (its district
    id) and one column per following plan. All rows are stacked and saved as
    ``<state>_district_contains_district.csv`` in the state folder.
    """
    fips = state_fips()

    # Iterate over each state
    for state, _ in fips.items():
        # Get the base path to the state folder
        base_path = 'clean_data/' + state + '/'

        # Get the relevant redistricting plans, sorted within each level and
        # recombined so we interpolate in the proper order
        shapefiles = [x for x in os.listdir(base_path) if x.endswith('.shp')]
        sldl = sorted(x for x in shapefiles if 'sldl' in x)
        sldu = sorted(x for x in shapefiles if 'sldu' in x)
        cd = sorted(x for x in shapefiles if 'cd' in x)
        files = sldl + sldu + cd

        # One frame per base plan, concatenated once after the loop
        frames = []

        # Iterate through all redistricting plans; the last file has nothing
        # after it to interpolate against, so it is never a base
        for base_ix, base_file in enumerate(files[:-1]):
            # Create list of other files we will interpolate
            interpolate_files = files[base_ix + 1:]

            # Load the base file
            df_base = gpd.read_file(base_path + base_file)

            # Define relevant column names
            base_district_year = get_district_year(base_file)
            base_id_col = district_attribute(base_district_year)

            # Create relevant base columns
            df_base['base'] = base_district_year
            df_base['base_col'] = base_id_col
            df_base['base_value'] = df_base[base_id_col]

            # Set keep columns
            keep_cols = ['base', 'base_col', 'base_value']

            # Iterate through the interpolate files
            for inter_file in interpolate_files:
                # Load the interpolation district file
                df_inter = gpd.read_file(base_path + inter_file)

                # Define relevant column names
                inter_district_year = get_district_year(inter_file)
                inter_id_col = district_attribute(inter_district_year)

                # Print progress
                print(state, base_district_year, inter_district_year)

                # Add to keep columns
                keep_cols.append(inter_district_year)

                # Distribute label
                df_base = distribute_label(df_inter, [inter_id_col], df_base,
                                           [inter_district_year])

                # Check if district is fully contained, otherwise set to
                # None. The row index gets its own name so it cannot be
                # confused with the position of the base file.
                for row_ix, row in df_base.iterrows():
                    base_district = row['geometry']
                    is_labelled_district = (
                        df_inter[inter_id_col] == row[inter_district_year])
                    inter_district = df_inter.loc[is_labelled_district,
                                                  'geometry'].iloc[0]

                    # If interpolation district does not fully contain the
                    # base district then we set its value to null
                    intersect = base_district.intersection(inter_district)
                    area_ratio = intersect.area / base_district.area
                    if area_ratio < 0.999:
                        df_base.at[row_ix, inter_district_year] = None

            # Reduce to relevant keep columns
            frames.append(df_base[keep_cols])

        # pd.concat replaces DataFrame.append, which was removed in pandas
        # 2.0. With fewer than two plans there is nothing to stack and an
        # empty frame is saved, as before.
        df = pd.concat(frames) if frames else pd.DataFrame()

        # Save dataframe
        df.to_csv(base_path + state + '_district_contains_district.csv',
                  index=False)

    return