Example #1
def main():
    """Aggregate various interpolation metrics

    nationwide_classifications: nationwide block equivalency file

    We will also create the following aggregated files
        nationwide_district_contains_county
        nationwide_district_contains_district
        nationwide_district_county_intersection
    """
    fips = state_fips()

    # Perform aggregations by iterating through clean data and appending
    df = aggregate_nationwide(fips, 'classifications',
                              ['state', 'GEOID10', 'pop'])
    df_dcc = aggregate_nationwide(fips, 'district_contains_county',
                                  ['state', 'COUNTYFP'])
    df_dcd = aggregate_nationwide(fips, 'district_contains_district',
                                  ['state', 'base', 'base_value'])
    df_dci = aggregate_nationwide(fips, 'district_county_intersection',
                                  ['state', 'COUNTYFP'])

    # Save aggregated files
    df.to_csv('nationwide_classifications.csv', index=False)
    df_dcc.to_csv('nationwide_district_contains_county.csv', index=False)
    df_dcd.to_csv('nationwide_district_contains_district.csv', index=False)
    df_dci.to_csv('nationwide_district_county_intersection.csv', index=False)
    return
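

# Hypothetical sketch of the aggregate_nationwide helper assumed above. It
# presumes each state folder holds a per-state file named <state>_<name>.csv
# under clean_data/ (consistent with the paths used elsewhere in this
# pipeline); the real implementation may differ.
import pandas as pd


def aggregate_nationwide(fips, name, id_cols):
    """Concatenate per-state CSVs into one nationwide dataframe.

    Arguments:
        fips: dict mapping state abbreviation to fips code
        name: suffix of the per-state file, e.g. 'classifications'
        id_cols: identifier columns to place first in the output
    """
    frames = []
    for state in fips:
        path = 'clean_data/' + state + '/' + state + '_' + name + '.csv'
        df_state = pd.read_csv(path)
        df_state['state'] = state
        frames.append(df_state)
    df = pd.concat(frames, ignore_index=True)

    # Put identifier columns first for readability
    other_cols = [x for x in df.columns if x not in id_cols]
    return df[id_cols + other_cols]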
Example #2
def main():
    """Clean all census data in preparation for analysis

    We start by creating new folders in a clean_data directory

    Next, we split nationwide shapefiles into statewide shapefiles

    Then, we rename and move state legislative districts

    Afterwards, we join census geography data to census population data

    Finally, we remove duplicative boundary shapefiles
    """
    # Get list of state fips
    fips = state_fips()

    # Create folders for each state
    create_state_directories(fips)

    # Split nationwide files into state files
    split_counties(fips)
    split_congressional_districts(fips)

    # Move state legislative districts
    move_state_legislative_districts(fips)

    # Join census data
    join_census_geo_and_pop(fips)
    return
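

# Hypothetical sketch of create_state_directories as used above. It assumes a
# clean_data/ root with one folder per state, which matches the
# 'clean_data/' + state + '/' paths used throughout this pipeline.
import os


def create_state_directories(fips):
    """Create a clean_data folder for each state if it does not exist."""
    for state in fips:
        os.makedirs(os.path.join('clean_data', state), exist_ok=True)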
Example #3
def main():
    """Remove duplicative boundary shapefiles."""
    # Get list of state fips
    fips = state_fips()

    # Remove duplicative boundaries
    remove_duplicative_boundaries(fips, 'cd')
    remove_duplicative_boundaries(fips, 'county')
    remove_duplicative_boundaries(fips, 'sldl')
    remove_duplicative_boundaries(fips, 'sldu')
    return
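

# Hypothetical sketch of state_fips. The only property the callers above rely
# on is a dict mapping a state identifier (also used as a folder name) to its
# census fips code; the two entries below are illustrative, not the full list.
def state_fips():
    """Return a dict mapping state abbreviation to census fips code."""
    return {
        'AL': '01',
        'AK': '02',
        # ... the remaining states would follow the same pattern
    }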
Example #4
def main():
    """Interpolate district boundaries on census block data."""
    fips = state_fips()

    # Iterate over each state
    for state, fips_code in fips.items():
        # Get the base path to the state folder
        base_path = 'clean_data/' + state + '/'

        # Get county and redistricting plan shapefiles
        files = os.listdir(base_path)
        files = [x for x in files if x[-4:] == '.shp']
        files = [x for x in files if 'blocks' not in x]
        counties = [x for x in files if 'county' in x]
        districts = [x for x in files if 'county' not in x]

        # Load most recent county file
        counties.sort()
        df = gpd.read_file(base_path + counties[-1])

        # Add systematic countyfp
        if 'COUNTYFP00' in df.columns:
            df['COUNTYFP'] = df['COUNTYFP00']
        if 'COUNTYFP10' in df.columns:
            df['COUNTYFP'] = df['COUNTYFP10']

        # Iterate through each district file
        keep_cols = ['COUNTYFP']
        for file in districts:
            print('INTERSECTIONS', file, '\n')
            # Load the district file
            district_path = base_path + file
            df_dist = gpd.read_file(district_path)

            # Define relevant column names and add to keep columns
            district_year = get_district_year(file)
            dist_col = district_attribute(district_year)
            keep_cols.append(district_year)

            # Detect intersections
            df = county_district_intersections(df, district_year, df_dist,
                                               dist_col)

        # Save dataframe
        df = df[keep_cols]
        df.to_csv(base_path + state + '_district_county_intersection.csv',
                  index=False)

    return
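

# Hypothetical sketch of county_district_intersections as used above. It
# assumes the output column should hold, for each county, the ids of every
# district whose geometry overlaps that county with positive area; the real
# helper may record the intersections differently.
def county_district_intersections(df, district_year, df_dist, dist_col):
    """Record which districts in a plan intersect each county.

    Arguments:
        df: county geodataframe (one row per county)
        district_year: name of the output column for this plan
        df_dist: district geodataframe for one redistricting plan
        dist_col: attribute in df_dist holding the district id
    """
    intersections = []
    for _, county in df.iterrows():
        geom = county['geometry']

        # Keep districts whose intersection with the county has positive area
        overlaps = df_dist[df_dist.intersects(geom)]
        ids = [str(row[dist_col]) for _, row in overlaps.iterrows()
               if row['geometry'].intersection(geom).area > 0]
        intersections.append(', '.join(sorted(ids)))
    df[district_year] = intersections
    return df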
Example #5
def main():
    """Interpolate district boundaries on census block data."""
    fips = state_fips()

    # Iterate over each state
    for state, fips_code in fips.items():
        # Get the base path to the state folder
        base_path = 'clean_data/' + state + '/'

        # Load state census block shapefile
        blocks_path = base_path + state + '_blocks.shp'
        df = gpd.read_file(blocks_path)

        # Load district contains district and rename columns to note imputation
        district_path = base_path + state + '_district_contains_district.csv'
        df_district = pd.read_csv(district_path)
        district_cols = list(df_district.columns)
        district_cols_base = district_cols[:3]
        district_cols_other = district_cols[3:]
        district_cols_other = [x + '_imputed' for x in district_cols_other]
        df_district.columns = district_cols_base + district_cols_other

        # Load district and county containment dataframes
        county_path = base_path + state + '_district_contains_county.csv'
        df_county = pd.read_csv(county_path)

        # Load in district county intersections
        inter_path = base_path + state + '_district_county_intersection.csv'
        df_inter = pd.read_csv(inter_path)

        # Get the relevant redistricting plans
        files = os.listdir(base_path)
        files = [x for x in files if x[-4:] == '.shp']
        sldl = [x for x in files if 'sldl' in x]
        sldu = [x for x in files if 'sldu' in x]
        cd = [x for x in files if 'cd' in x]

        # Sort and recombine so we interpolate in proper order
        sldl.sort()
        sldu.sort()
        cd.sort()
        files = sldl + sldu + cd
        district_years = [get_district_year(x) for x in files]

        # Join most updated classifications
        class_path = base_path + state + '_classifications.csv'
        if os.path.isfile(class_path):
            df_class = pd.read_csv(class_path)
            df_class['GEOID10'] = df_class['GEOID10'].astype(str).str.zfill(15)
            df_class = df_class.drop('pop', axis=1)
            df = df.merge(df_class, on='GEOID10')

        # Iterate through each redistricting plan
        for file_ix, file in enumerate(files):
            # Get the district level and year
            district_year = get_district_year(file)

            # If the redistricting plan has already been fully classified in
            # the block file, we can continue
            if district_year in df.columns:
                if len(df[df[district_year].isna()]) == 0:
                    continue

            # Reduce county contains dataframe and join
            df_county_plan = reduce_county_contains(df_county, district_year)
            df = add_district_contains_counties(df, df_county_plan,
                                                district_year)

            # Reduce county district intersection plan
            df_inter_plan = reduce_district_county_intersection(
                df_inter, district_year)

            # Split blocks into classified and unclassified
            df_classified = df[df[district_year].notna()]
            df_unclassified = df[df[district_year].isna()]

            # Show progress and load redistricting plan
            print('\nINTERPOLATING', file, len(df_classified),
                  len(df_unclassified))
            df_plan = gpd.read_file(base_path + file)

            # Distribute label to unclassified blocks
            if len(df_unclassified) > 0:
                dist_col = district_attribute(district_year)
                df_labeled = distribute_labels_by_subset(
                    df_plan, dist_col, df_unclassified, district_year,
                    df_inter_plan)

                # Combine classified and unclassified
                df_labeled = df_labeled.drop('check_districts', axis=1)
                df = pd.concat([df_classified, df_labeled])
            df = df.reset_index(drop=True)

            # Add district contains district
            print('\tDistrict Contains Districts')
            df = add_district_contains_district(df, df_district, district_year)

            # Remove imputed columns
            imputed_cols = list(df.columns)
            imputed_cols = [x for x in imputed_cols if 'imputed' in x]
            df = df.drop(columns=imputed_cols)

            # Get equivalency file for this plan
            equiv_path = base_path + state + '_classifications_'
            equiv_path += district_year + '.csv'
            df_equiv = df[['GEOID10', district_year]]
            df_equiv.to_csv(equiv_path, index=False)

            # Save block equivalency file for all plans
            print('\n\nSaving', state)
            state_path = base_path + state + '_classifications.csv'
            district_cols = set(district_years).intersection(set(df.columns))
            df_state = df[['GEOID10', 'pop'] + list(district_cols)]
            df_state.to_csv(state_path, index=False)

    return
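

# Hypothetical sketches of reduce_county_contains and
# add_district_contains_counties as used above. They assume the
# *_district_contains_county.csv file has one row per county with NaN where no
# single district fully contains the county, and that COUNTYFP values in the
# block and county frames share the same dtype; the real helpers may differ.
def reduce_county_contains(df_county, district_year):
    """Keep counties that are fully contained by a district in this plan."""
    df_plan = df_county[['COUNTYFP', district_year]]
    return df_plan[df_plan[district_year].notna()]


def add_district_contains_counties(df, df_county_plan, district_year):
    """Label blocks whose county is fully contained by a single district."""
    df = df.merge(df_county_plan, on='COUNTYFP', how='left',
                  suffixes=('', '_county'))

    # If the plan column already existed, only fill blocks that are still
    # unclassified, then drop the merged helper column
    county_col = district_year + '_county'
    if county_col in df.columns:
        df[district_year] = df[district_year].fillna(df[county_col])
        df = df.drop(columns=[county_col])
    return df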
Example #6
def main():
    """Interpolate district boundaries on census block data."""
    fips = state_fips()

    # Iterate over each state
    for state, fips_code in fips.items():
        # Get the base path to the state folder
        base_path = 'clean_data/' + state + '/'

        # Get county and redistricting plan shapefiles
        files = os.listdir(base_path)
        files = [x for x in files if x[-4:] == '.shp']
        files = [x for x in files if 'blocks' not in x]
        counties = [x for x in files if 'county' in x]
        districts = [x for x in files if 'county' not in x]

        # Load most recent county file
        counties.sort()
        df = gpd.read_file(base_path + counties[-1])

        # Add systematic countyfp
        if 'COUNTYFP00' in df.columns:
            df['COUNTYFP'] = df['COUNTYFP00']
        if 'COUNTYFP10' in df.columns:
            df['COUNTYFP'] = df['COUNTYFP10']

        # Iterate through each district file
        keep_cols = ['COUNTYFP']
        for file in districts:
            print('INTERPOLATING', file, '\n')
            # Load the district file
            district_path = base_path + file
            df_dist = gpd.read_file(district_path)

            # Define relevant column names and add to keep columns
            district_year = get_district_year(file)
            dist_col = district_attribute(district_year)
            keep_cols.append(district_year)

            # Distribute label
            df = distribute_label(df_dist, [dist_col], df, [district_year])

            # Check if county is fully contained, otherwise set to None
            for ix, row in df.iterrows():
                # Get district and county geometry
                district = df_dist.loc[df_dist[dist_col] == row[district_year],
                                       'geometry'].iloc[0]
                county = row['geometry']

                # If district does not fully contain a county then set it
                # equal to null
                intersection_area = district.intersection(county).area
                area_ratio = intersection_area / county.area
                if area_ratio < 0.999:
                    df.at[ix, district_year] = None

        # Save dataframe
        df = df[keep_cols]
        df.to_csv(base_path + state + '_district_contains_county.csv',
                  index=False)

    return
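

# Hypothetical sketch of distribute_label, the areal-assignment helper assumed
# above. This version assigns each target geometry the source label with the
# largest area of overlap; the actual rule used by the real helper may differ.
def distribute_label(df_source, source_cols, df_target, target_cols):
    """Copy labels from source geometries onto overlapping target geometries.

    Arguments:
        df_source: geodataframe providing the labels (e.g. districts)
        source_cols: label columns to read from df_source
        df_target: geodataframe receiving the labels (e.g. counties)
        target_cols: new column names on df_target, aligned with source_cols
    """
    for source_col, target_col in zip(source_cols, target_cols):
        labels = []
        for _, target in df_target.iterrows():
            geom = target['geometry']

            # Area of overlap between this target and every source geometry
            areas = df_source['geometry'].intersection(geom).area
            if areas.max() > 0:
                labels.append(df_source.loc[areas.idxmax(), source_col])
            else:
                labels.append(None)
        df_target[target_col] = labels
    return df_target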
Example #7
def main():
    """Interpolate district boundaries on census block data."""
    fips = state_fips()

    # Iterate over each state
    for state, fips_code in fips.items():
        # Get the base path to the state folder
        base_path = 'clean_data/' + state + '/'

        # Get the relevant redistricting plans
        files = os.listdir(base_path)
        files = [x for x in files if x[-4:] == '.shp']
        sldl = [x for x in files if 'sldl' in x]
        sldu = [x for x in files if 'sldu' in x]
        cd = [x for x in files if 'cd' in x]

        # Sort and recombine so we interpolate in proper order
        sldl.sort()
        sldu.sort()
        cd.sort()
        files = sldl + sldu + cd

        # Initialize dataframe
        df = pd.DataFrame()

        # Iterate through all redistricting plans
        for ix, base_file in enumerate(files):
            # If we are on the last file do not interpolate
            if ix == len(files) - 1:
                break

            # Create list of other files we will interpolate
            interpolate_files = files[ix + 1:]

            # Load the base file
            df_base = gpd.read_file(base_path + base_file)

            # Define relevant column names
            base_district_year = get_district_year(base_file)
            base_id_col = district_attribute(base_district_year)

            # Create relevant base_cols
            df_base['base'] = base_district_year
            df_base['base_col'] = base_id_col
            df_base['base_value'] = df_base[base_id_col]

            # Set keep columns
            keep_cols = ['base', 'base_col', 'base_value']

            # Iterate through the interpolate files
            for inter_file in interpolate_files:
                # Load the interpolation district file
                df_inter = gpd.read_file(base_path + inter_file)

                # Define relevant column names
                inter_district_year = get_district_year(inter_file)
                inter_id_col = district_attribute(inter_district_year)

                # Print progress
                print(state, base_district_year, inter_district_year)

                # Add to keep columns
                keep_cols.append(inter_district_year)

                # Distribute label
                df_base = distribute_label(df_inter, [inter_id_col], df_base,
                                           [inter_district_year])

                # Check if district is fully contained, otherwise set to None
                for row_ix, row in df_base.iterrows():
                    base_district = row['geometry']
                    inter_district = df_inter.loc[df_inter[inter_id_col] ==
                                                  row[inter_district_year],
                                                  'geometry'].iloc[0]

                    # If the interpolation district does not fully contain the
                    # base district then we set its value to null
                    intersect = base_district.intersection(inter_district)
                    intersection_area = intersect.area
                    area_ratio = intersection_area / base_district.area
                    if area_ratio < 0.999:
                        df_base.at[row_ix, inter_district_year] = None

            # Reduce to relevant keep columns
            df_base = df_base[keep_cols]
            df = pd.concat([df, df_base])

        # Save dataframe
        df.to_csv(base_path + state + '_district_contains_district.csv',
                  index=False)
    return
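

# Hypothetical sketches of get_district_year and district_attribute, two
# helpers assumed throughout these scripts. They assume plan shapefiles are
# named '<state>_<level>_<year>.shp' with a 'cd', 'sldl', or 'sldu' level, and
# that each level maps to a single id attribute; both assumptions are
# illustrative only and the real helpers may differ.
def get_district_year(filename):
    """Return the '<level>_<year>' label encoded in a plan filename."""
    name = filename[:-4]  # strip the '.shp' extension
    parts = name.split('_')
    return parts[-2] + '_' + parts[-1]


def district_attribute(district_year):
    """Return the shapefile attribute holding the district identifier."""
    level = district_year.split('_')[0]

    # Placeholder attribute names; real TIGER/Line files encode the session or
    # year in the congressional district field name
    attributes = {'cd': 'CDFP', 'sldl': 'SLDLST', 'sldu': 'SLDUST'}
    return attributes[level]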