def usgs_fba_data_cleanup(df):
    """
    Clean up the dataframe to prepare for flowbysector. Used in flowbysector.py
    :param df: df, FBA format
    :return: df, modified FBA
    """
    # drop rows of commercial data (because it only exists for 3 states),
    # causes issues because linked with public supply
    # also drop closed-loop or once-through cooling (thermoelectric power)
    # to avoid double counting
    vLogDetailed.info('Removing all rows for Commercial Data because it does '
                      'not exist for all states and causes issues because it '
                      'is linked with Public Supply deliveries.')
    dfa = df[~df['Description'].str.lower().str.
             contains('commercial|closed-loop cooling|once-through')]
    calculate_flowamount_diff_between_dfs(df, dfa)
    # calculate NET PUBLIC SUPPLY by subtracting out deliveries to domestic
    vLogDetailed.info('Modify the public supply values to generate '
                      'NET public supply by subtracting out deliveries '
                      'to domestic')
    dfb = calculate_net_public_supply(dfa)
    # check that golf + crop = total irrigation, if not,
    # assign all of total irrigation to crop
    vLogDetailed.info('If states do not distinguish between golf and crop '
                      'irrigation as a subset of total irrigation, assign '
                      'all of total irrigation to crop')
    dfc = check_golf_and_crop_irrigation_totals(dfb)

    # national
    df1 = dfc[dfc['Location'] == US_FIPS]

    # drop flowname = 'total' rows when possible to prevent double counting
    # subset data where flowname = total and where it does not
    vLogDetailed.info('Drop rows where the FlowName is total to prevent '
                      'double counting at the state and county levels. '
                      'Retain rows at national level')
    df2 = dfc[dfc['FlowName'] == 'total']
    # set conditions for data to keep when flowname = 'total'
    c1 = df2['Location'] != US_FIPS
    c2 = (~df2['ActivityProducedBy'].isnull()) & \
         (~df2['ActivityConsumedBy'].isnull())
    # subset data
    df2 = df2[c1 & c2].reset_index(drop=True)

    # second subset doesn't have total flowname or total compartment
    df3 = dfc[dfc['FlowName'] != 'total']
    df3 = df3[df3['Compartment'] != 'total']
    df3 = df3[df3['Location'] != US_FIPS]

    # concat the three dfs
    dfd = pd.concat([df1, df2, df3], ignore_index=True, sort=False)

    # In 2015, there is data for consumptive water use for
    # thermo and crop; drop because consumptive water loss is not
    # calculated for all water categories
    dfd = dfd[dfd['Compartment'] != 'air'].reset_index(drop=True)

    return dfd

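# Illustrative sketch (not part of the original module): shows, on a toy
# dataframe with hypothetical values, the rule applied above for
# FlowName = 'total' rows - keep them only when the row is sub-national AND
# both activity columns are populated. Column names mirror the FBA format.
def _demo_drop_total_rows():
    import pandas as pd

    toy = pd.DataFrame({
        'Location': ['00000', '06000', '06000'],
        'FlowName': ['total', 'total', 'fresh'],
        'ActivityProducedBy': [None, 'Public Supply', 'Public Supply'],
        'ActivityConsumedBy': ['Domestic', 'Domestic', 'Domestic'],
        'FlowAmount': [100.0, 60.0, 40.0]})
    totals = toy[toy['FlowName'] == 'total']
    keep = totals[(totals['Location'] != '00000') &
                  totals['ActivityProducedBy'].notnull() &
                  totals['ActivityConsumedBy'].notnull()]
    # only the sub-national row with both activities populated remains
    return keep
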
def mecs_land_fba_cleanup_for_land_2012_fbs(fba):
    """
    The 'land_national_2012' FlowBySector uses MECS 2014 data, set
    MECS year to 2012
    :param fba: df, EIA MECS Land, FBA format
    :return: df, EIA MECS Land FBA modified
    """
    fba = mecs_land_fba_cleanup(fba)

    # reset the EIA MECS Land year from 2014 to 2012 to match
    # the USDA ERS MLU year
    vLogDetailed.info('Resetting year from 2014 to 2012')
    fba['Year'] = 2012

    return fba

def calculate_total_facility_land_area(df):
    """
    In land use calculations, in addition to the provided floor area
    of buildings, estimate other related land area associated with
    commercial facilities (parking, signage, and landscaped area)
    :param df: df, eia cbecs land
    :return: df, modified eia cbecs land that incorporates additional
        land area for each activity
    """
    floor_space_to_land_area_ratio = \
        get_commercial_and_manufacturing_floorspace_to_land_area_ratio()

    vLogDetailed.info('Modifying FlowAmounts - Assuming the floor space to '
                      'land area ratio is 1:4')
    df = df.assign(
        FlowAmount=(df['FlowAmount'] / floor_space_to_land_area_ratio) -
        df['FlowAmount'])

    return df

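# Illustrative sketch (hypothetical values, not part of the original module):
# with a floorspace-to-land-area ratio of 0.25 (i.e., 1:4), the value returned
# above is FlowAmount / 0.25 - FlowAmount = 3 x FlowAmount, the land area in
# addition to the building footprint.
def _demo_facility_land_area():
    import pandas as pd

    ratio = 0.25  # assumed 1:4 floorspace-to-land ratio
    toy = pd.DataFrame({'FlowAmount': [10.0, 250.0]})  # floorspace
    toy = toy.assign(
        FlowAmount=(toy['FlowAmount'] / ratio) - toy['FlowAmount'])
    return toy  # 30.0 and 750.0: additional land area beyond the footprint
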
def cbecs_land_fba_cleanup(fba_load):
    """
    Clean up the land fba for use in allocation
    :param fba_load: df, eia cbecs land flowbyactivity format
    :return: df, flowbyactivity with modified values
    """
    # estimate floor space using number of floors
    fba = calculate_floorspace_based_on_number_of_floors(fba_load)

    # calculate the land area in addition to building footprint
    fba1 = calculate_total_facility_land_area(fba)

    # drop activities of 'all buildings' to avoid double counting
    fba2 = fba1[fba1['ActivityConsumedBy'] != 'All buildings'].reset_index(
        drop=True)
    vLogDetailed.info('Drop the principal building activity "All buildings" '
                      'to avoid double counting')
    calculate_flowamount_diff_between_dfs(fba1, fba2)

    return fba2

def calculate_floorspace_based_on_number_of_floors(fba_load):
    """
    Estimate total floorspace for each building type based on data on
    the number of floors for each building type.
    Assumptions (taken from Yang's static satellite tables):
    1. When floor range is 4-9, assume 6 stories
    2. When floor range is 10 or more, assume 15 stories
    :param fba_load: df, eia cbecs land flowbyactivity
    :return: df, eia cbecs land fba with estimated total floorspace
    """
    # disaggregate mercantile to malls and non-malls
    fba = disaggregate_eia_cbecs_mercentile(fba_load)
    vLogDetailed.info('Calculate floorspace for mall and nonmall buildings '
                      'with different number of floors. Once calculated, '
                      'drop mercantile data from dataframe to avoid double '
                      'counting.')
    calculate_flowamount_diff_between_dfs(fba_load, fba)

    # disaggregate other and vacant
    fba2 = disaggregate_eia_cbecs_vacant_and_other(fba)
    vLogDetailed.info('Due to data suppression for floorspace by building '
                      'number of floors, some data is lost when dropping '
                      'floorspace for all buildings within a principal '
                      'building activity. To avoid this data loss, all '
                      'remaining floorspace for "All buildings" by number of '
                      'floors is allocated to "Vacant" and "Other" principal '
                      'building activities, as these activities are allocated '
                      'to all commercial building sectors. This assumption '
                      'results in a total floorspace increase for "Vacant" '
                      'and "Other" activities.')
    calculate_flowamount_diff_between_dfs(fba, fba2)

    # drop data for 'all buildings'
    fba3 = fba2[fba2['Description'] != 'All buildings']
    # add column 'DivisionFactor' based on description
    fba3 = fba3.assign(DivisionFactor=fba3['Description'].apply(lambda x: (
        1 if 'One' in x else (2 if 'Two' in x else (3 if 'Three' in x else (
            6 if 'Four' in x else (15 if 'Ten' in x else "")))))))
    # modify flowamounts to represent building footprint rather than
    # total floorspace
    fba3['FlowAmount'] = fba3['FlowAmount'] / fba3['DivisionFactor']
    # sum values to get a single flowamount for each building type
    vLogDetailed.info('Drop flows for "All Buildings" to avoid double '
                      'counting, as floorspace is maintained by buildings '
                      'based on number of floors. Also dividing total '
                      'floorspace by number of floors to calculate a '
                      'building footprint. Calculations result in a reduced '
                      'FlowAmount for all categories.')
    calculate_flowamount_diff_between_dfs(fba2, fba3)
    # rename the FlowName and sum so there is a single total floorspace row,
    # rather than multiple rows based on number of floors
    fba3 = fba3.assign(
        FlowName=fba3['FlowName'].apply(lambda x: ','.join(x.split(',')[:-1])))
    # modify the description
    fba3 = fba3.assign(Description='Building Footprint')
    groupbycols = fba_mapped_default_grouping_fields
    fba4 = aggregator(fba3, groupbycols)

    return fba4

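# Illustrative sketch (not part of the original module): the nested lambda
# above maps a floor-range description to an assumed story count (the
# 'DivisionFactor'). The same mapping can be written as a lookup loop; the
# description strings passed in are hypothetical examples.
def _demo_division_factor(description):
    assumed_stories = [('One', 1), ('Two', 2), ('Three', 3),
                       ('Four', 6), ('Ten', 15)]
    for keyword, stories in assumed_stories:
        if keyword in description:
            return stories
    return ""  # mirrors the original fallback for unmatched descriptions

# e.g., _demo_division_factor('Four to nine floors') returns 6, and
# _demo_division_factor('Ten or more floors') returns 15.
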
def allocate_usda_ers_mlu_other_land(df, attr, fbs_list):
    """
    From the USDA ERS MLU 2012 report:
    "Includes miscellaneous other uses, such as industrial and commercial
    sites in rural areas, cemeteries, golf courses, mining areas, quarry
    sites, marshes, swamps, sand dunes, bare rocks, deserts, tundra, rural
    residential, and other unclassified land. In this report, urban land is
    reported as a separate category."

    Mining data is calculated using a separate source: BLM PLS.
    The goal is to extract rural residential land area from the total
    value of 'Other Land'.
    :param df: df, USDA ERS MLU Land
    :param attr: dictionary, attribute data from method yaml for activity set
    :param fbs_list: list, FBS dfs for activities created prior to the
        activity set that calls on this fxn
    :return: df, allocated USDA ERS MLU Land, FBS format
    """
    # land in rural residential lots
    rural_res = get_area_of_rural_land_occupied_by_houses_2013()

    # household codes
    household = load_crosswalk('household')
    household = household['Code'].drop_duplicates().tolist()

    # in df, where sector is a personal expenditure value, and
    # location = 00000, replace with rural res value
    vLogDetailed.info('The only category for MLU other land use is rural land '
                      'occupation. All other land area in this category is '
                      'unassigned to sectors, resulting in unaccounted land '
                      'area.')
    df['FlowAmount'] = np.where(df['SectorConsumedBy'].isin(household),
                                rural_res, df['FlowAmount'])

    return df

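# Illustrative sketch (hypothetical sector codes and areas, not part of the
# original module): rows whose consuming sector is in the household code list
# receive the rural residential land estimate; all other rows keep their
# original FlowAmount, mirroring the np.where call above.
def _demo_other_land_allocation():
    import numpy as np
    import pandas as pd

    rural_res = 25.0  # hypothetical rural residential land area
    household_codes = ['F01000']
    toy = pd.DataFrame({'SectorConsumedBy': ['F01000', '212310'],
                        'FlowAmount': [100.0, 100.0]})
    toy['FlowAmount'] = np.where(
        toy['SectorConsumedBy'].isin(household_codes),
        rural_res, toy['FlowAmount'])
    return toy  # household row becomes 25.0, the other row stays 100.0
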
def compare_geographic_totals(df_subset, df_load, sourcename, attr,
                              activity_set, activity_names):
    """
    Check for any data loss between the geoscale used and published
    national data
    :param df_subset: df, after subset by geography
    :param df_load: df, loaded data, including published national data
    :param sourcename: str, source name
    :param attr: dictionary, attributes
    :param activity_set: str, activity set
    :param activity_names: list of names in the activity set by which
        to subset national level data
    :return: df, comparing published national level data to df subset
    """
    # subset df_load to national level
    nat = df_load[df_load['Location'] == US_FIPS].reset_index(
        drop=True).rename(columns={'FlowAmount': 'FlowAmount_nat'})
    # if df len is not 0, continue with comparison
    if len(nat) != 0:
        # subset national level data by activity set names
        nat = nat[(nat[fba_activity_fields[0]].isin(activity_names)) | (
            nat[fba_activity_fields[1]].isin(activity_names))].reset_index(
            drop=True)
        nat = replace_strings_with_NoneType(nat)
        # drop the geoscale in df_subset and sum
        sub = df_subset.assign(Location=US_FIPS)
        # depending on the datasource, might need to rename some
        # strings for national comparison
        sub = rename_column_values_for_comparison(sub, sourcename)
        sub2 = aggregator(sub, fba_default_grouping_fields).rename(
            columns={'FlowAmount': 'FlowAmount_sub'})

        # compare df
        merge_cols = [
            'Class', 'SourceName', 'FlowName', 'Unit', 'FlowType',
            'ActivityProducedBy', 'ActivityConsumedBy', 'Compartment',
            'Location', 'LocationSystem', 'Year'
        ]
        # compare units
        compare_df_units(nat, sub2)
        df_m = pd.merge(nat[merge_cols + ['FlowAmount_nat']],
                        sub2[merge_cols + ['FlowAmount_sub']],
                        how='outer')
        df_m = df_m.assign(FlowAmount_diff=df_m['FlowAmount_nat'] -
                           df_m['FlowAmount_sub'])
        df_m = df_m.assign(Percent_Diff=(
            abs(df_m['FlowAmount_diff'] / df_m['FlowAmount_nat']) * 100))
        df_m = df_m[df_m['FlowAmount_diff'] != 0].reset_index(drop=True)
        # subset the merged df to what to include in the validation df
        # include data where percent difference is > 1 or where value is nan
        df_m_sub = df_m[(df_m['Percent_Diff'] > 1) |
                        (df_m['Percent_Diff'].isna())].reset_index(drop=True)

        if len(df_m_sub) == 0:
            vLog.info('No data loss greater than 1%% between national '
                      'level data and %s subset',
                      attr['allocation_from_scale'])
        else:
            vLog.info('There are data differences between published national '
                      'values and %s subset, saving to validation log',
                      attr['allocation_from_scale'])

            vLogDetailed.info(
                'Comparison of National FlowAmounts to aggregated data '
                'subset for %s: \n {}'.format(df_m_sub.to_string()),
                activity_set)

def compare_fba_geo_subset_and_fbs_output_totals(fba_load, fbs_load,
                                                 activity_set, source_name,
                                                 source_attr, activity_attr,
                                                 method):
    """
    Function to compare the loaded flowbyactivity total after subsetting by
    activity and geography with the final flowbysector output total. Not a
    direct comparison of the loaded FBA because FBAs are modified before
    being subset by activity for the target sector level.
    :param fba_load: df, FBA loaded, before being mapped
    :param fbs_load: df, final FBS df at target sector level
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param source_attr: dictionary, attribute data from method yaml
        for source data
    :param activity_attr: dictionary, attribute data from method yaml
        for activity set
    :param method: dictionary, FBS method yaml
    :return: printout data differences between loaded FBA and FBS output
        totals by location, save results as csv in local directory
    """
    vLog.info('Comparing Flow-By-Activity subset by activity and geography to '
              'the subset Flow-By-Sector FlowAmount total.')

    # determine from scale
    if fips_number_key[source_attr['geoscale_to_use']] < \
            fips_number_key[activity_attr['allocation_from_scale']]:
        from_scale = source_attr['geoscale_to_use']
    else:
        from_scale = activity_attr['allocation_from_scale']

    # extract relevant geoscale data or aggregate existing data
    fba = subset_df_by_geoscale(fba_load, from_scale,
                                method['target_geoscale'])
    if check_activities_sector_like(source_name):
        # if activities are sector-like, run sector aggregation and then
        # subset df to only keep NAICS2
        fba = fba[[
            'Class', 'FlowAmount', 'Unit', 'Context', 'ActivityProducedBy',
            'ActivityConsumedBy', 'Location', 'LocationSystem'
        ]]
        # rename the activity cols to sector cols for purposes of aggregation
        fba = fba.rename(
            columns={
                'ActivityProducedBy': 'SectorProducedBy',
                'ActivityConsumedBy': 'SectorConsumedBy'
            })
        group_cols_agg = [
            'Class', 'Context', 'Unit', 'Location', 'LocationSystem',
            'SectorProducedBy', 'SectorConsumedBy'
        ]
        fba = sector_aggregation(fba, group_cols_agg)
        # subset fba to only include NAICS2
        fba = replace_NoneType_with_empty_cells(fba)
        fba = fba[fba['SectorConsumedBy'].apply(lambda x: len(x) == 2) |
                  fba['SectorProducedBy'].apply(lambda x: len(x) == 2)]
    # subset/agg dfs
    col_subset = [
        'Class', 'FlowAmount', 'Unit', 'Context', 'Location', 'LocationSystem'
    ]
    group_cols = ['Class', 'Unit', 'Context', 'Location', 'LocationSystem']
    # check units
    compare_df_units(fba, fbs_load)
    # fba
    fba = fba[col_subset]
    fba_agg = aggregator(fba, group_cols).reset_index(drop=True)
    fba_agg.rename(columns={
        'FlowAmount': 'FBA_amount',
        'Unit': 'FBA_unit'
    }, inplace=True)

    # fbs
    fbs = fbs_load[col_subset]
    fbs_agg = aggregator(fbs, group_cols)
    fbs_agg.rename(columns={
        'FlowAmount': 'FBS_amount',
        'Unit': 'FBS_unit'
    }, inplace=True)

    try:
        # merge FBA and FBS totals
        df_merge = fba_agg.merge(fbs_agg, how='left')
        df_merge['FlowAmount_difference'] = \
            df_merge['FBA_amount'] - df_merge['FBS_amount']
        df_merge['Percent_difference'] = \
            (df_merge['FlowAmount_difference'] / df_merge['FBA_amount']) * 100

        # reorder
        df_merge = df_merge[[
            'Class', 'Context', 'Location', 'LocationSystem', 'FBA_amount',
            'FBA_unit', 'FBS_amount', 'FBS_unit', 'FlowAmount_difference',
            'Percent_difference'
        ]]
        df_merge = replace_NoneType_with_empty_cells(df_merge)

        # list of contexts and locations
        context_list = df_merge[['Context', 'Location']].values.tolist()

        # loop through the contexts and print results of comparison
        vLog.info('Comparing FBA %s %s subset to FBS results. '
                  'Details in Validation Log', activity_set,
                  source_attr['geoscale_to_use'])
        for i, j in context_list:
            df_merge_subset = \
                df_merge[(df_merge['Context'] == i) &
                         (df_merge['Location'] == j)].reset_index(drop=True)
            diff_per = df_merge_subset['Percent_difference'][0]
            if np.isnan(diff_per):
                vLog.info('FlowBySector FlowAmount for %s %s %s '
                          'does not exist in the FBS',
                          source_name, activity_set, i)
                continue
            # make reporting more manageable
            if abs(diff_per) > 0.01:
                diff_per = round(diff_per, 2)
            else:
                diff_per = round(diff_per, 6)

            # diff_units = df_merge_subset['FBS_unit'][0]
            if diff_per > 0:
                vLog.info('FlowBySector FlowAmount for %s %s %s at %s is %s%% '
                          'less than the FlowByActivity FlowAmount',
                          source_name, activity_set, i, j, str(abs(diff_per)))
            elif diff_per < 0:
                vLog.info('FlowBySector FlowAmount for %s %s %s at %s is %s%% '
                          'more than the FlowByActivity FlowAmount',
                          source_name, activity_set, i, j, str(abs(diff_per)))
            elif diff_per == 0:
                vLogDetailed.info('FlowBySector FlowAmount for '
                                  '%s %s %s at %s is equal to the '
                                  'FlowByActivity FlowAmount',
                                  source_name, activity_set, i, j)

        # subset the df to include in the validation log
        # only print rows where the percent difference does not round to 0
        df_v = df_merge[df_merge['Percent_difference'].apply(
            lambda x: round(x, 3) != 0)].reset_index(drop=True)

        # log output
        log.info('Save the comparison of FlowByActivity load to FlowBySector '
                 'total FlowAmounts for %s in validation log file',
                 activity_set)
        # if df not empty, print, if empty, print string
        if df_v.empty:
            vLogDetailed.info('Percent difference for %s all round to 0',
                              activity_set)
        else:
            vLogDetailed.info('Comparison of FBA load to FBS total '
                              'FlowAmounts for %s: '
                              '\n {}'.format(df_v.to_string()), activity_set)
    except Exception:
        vLog.info('Error occurred when comparing total FlowAmounts '
                  'for FlowByActivity and FlowBySector')

def compare_activity_to_sector_flowamounts(fba_load, fbs_load, activity_set,
                                           source_name, config):
    """
    Function to compare the loaded flowbyactivity with the final
    flowbysector by activityname (if exists) to target sector level
    output, checking for data loss
    :param fba_load: df, FBA loaded and mapped using FEDEFL
    :param fbs_load: df, final FBS df
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param config: dictionary, method yaml
    :return: printout data differences between loaded FBA and FBS output,
        save results as csv in local directory
    """
    if check_activities_sector_like(source_name):
        vLog.debug('Not comparing loaded FlowByActivity to FlowBySector '
                   'ratios for a dataset with sector-like activities because '
                   'if there are modifications to flowamounts for a sector, '
                   'then the ratios will be different')
    else:
        # subset fba df
        fba = fba_load[[
            'Class', 'MetaSources', 'Flowable', 'Unit', 'FlowType',
            'ActivityProducedBy', 'ActivityConsumedBy', 'Context', 'Location',
            'LocationSystem', 'Year', 'FlowAmount'
        ]].drop_duplicates().reset_index(drop=True)
        fba.loc[:, 'Location'] = US_FIPS
        group_cols = [
            'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
            'FlowType', 'Context', 'Location', 'LocationSystem', 'Year'
        ]
        fba_agg = aggregator(fba, group_cols)
        fba_agg.rename(columns={'FlowAmount': 'FBA_amount'}, inplace=True)

        # subset fbs df
        fbs = fbs_load[[
            'Class', 'SectorSourceName', 'Flowable', 'Unit', 'FlowType',
            'SectorProducedBy', 'SectorConsumedBy', 'ActivityProducedBy',
            'ActivityConsumedBy', 'Context', 'Location', 'LocationSystem',
            'Year', 'FlowAmount'
        ]].drop_duplicates().reset_index(drop=True)

        fbs = replace_NoneType_with_empty_cells(fbs)

        fbs['ProducedLength'] = fbs['SectorProducedBy'].str.len()
        fbs['ConsumedLength'] = fbs['SectorConsumedBy'].str.len()
        fbs['SectorLength'] = fbs[['ProducedLength',
                                   'ConsumedLength']].max(axis=1)
        fbs.loc[:, 'Location'] = US_FIPS
        group_cols = [
            'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
            'FlowType', 'Context', 'Location', 'LocationSystem', 'Year',
            'SectorLength'
        ]
        fbs_agg = aggregator(fbs, group_cols)
        fbs_agg.rename(columns={'FlowAmount': 'FBS_amount'}, inplace=True)

        # merge the aggregated fba and fbs totals
        df_merge = fba_agg.merge(
            fbs_agg,
            left_on=[
                'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable',
                'Unit', 'FlowType', 'Context', 'Location', 'LocationSystem',
                'Year'
            ],
            right_on=[
                'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable',
                'Unit', 'FlowType', 'Context', 'Location', 'LocationSystem',
                'Year'
            ],
            how='left')
        df_merge['Ratio'] = df_merge['FBS_amount'] / df_merge['FBA_amount']

        # reorder
        df_merge = df_merge[[
            'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
            'FlowType', 'Context', 'Location', 'LocationSystem', 'Year',
            'SectorLength', 'FBA_amount', 'FBS_amount', 'Ratio'
        ]]

        # keep only rows of specified sector length
        comparison = df_merge[
            df_merge['SectorLength'] == sector_level_key[
                config['target_sector_level']]].reset_index(drop=True)

        tolerance = 0.01
        comparison2 = comparison[(comparison['Ratio'] < 1 - tolerance) |
                                 (comparison['Ratio'] > 1 + tolerance)]

        if len(comparison2) > 0:
            vLog.info('There are %s combinations of flowable/context/sector '
                      'length where the flowbyactivity to flowbysector ratio '
                      'is less than or greater than 1 by %s',
                      len(comparison2), str(tolerance))

        # include df subset in the validation log
        # only print rows where the flowamount ratio is less than 1
        # (round the flowamount ratio)
        df_v = comparison2[comparison2['Ratio'].apply(
            lambda x: round(x, 3) < 1)].reset_index(drop=True)

        # save to validation log
        log.info('Save the comparison of FlowByActivity load '
                 'to FlowBySector ratios for %s in validation log',
                 activity_set)
        # if df not empty, print, if empty, print string
        if df_v.empty:
            vLogDetailed.info('Ratios for %s all round to 1', activity_set)
        else:
            vLogDetailed.info('Comparison of FlowByActivity load to '
                              'FlowBySector ratios for %s: '
                              '\n {}'.format(df_v.to_string()), activity_set)

def calculate_flowamount_diff_between_dfs(dfa_load, dfb_load):
    """
    Calculate the differences in FlowAmounts between two dfs
    :param dfa_load: df, initial df
    :param dfb_load: df, modified df
    :return: df, comparing changes in flowamounts between 2 dfs
    """
    # subset the dataframes, only keeping data for easy
    # comparison of flowamounts
    drop_cols = [
        'Year', 'MeasureofSpread', 'Spread', 'DistributionType', 'Min', 'Max',
        'DataReliability', 'DataCollection'
    ]
    # drop cols and rename, ignore error if a df does not
    # contain a column to drop
    dfa = dfa_load.drop(
        drop_cols, axis=1,
        errors='ignore').rename(columns={'FlowAmount': 'FlowAmount_Original'})
    dfb = dfb_load.drop(
        drop_cols, axis=1,
        errors='ignore').rename(columns={'FlowAmount': 'FlowAmount_Modified'})
    # create a list to hold the modified dfs created in the for loop
    df_list = []
    for d in ['a', 'b']:
        df_name = f'df{d}'
        # assign new column of geoscale by which to aggregate
        vars()[df_name + '2'] = vars()[df_name].assign(geoscale=np.where(
            vars()[df_name]['Location'].apply(lambda x: x.endswith('000')),
            'state', 'county'))
        vars()[df_name + '2'] = vars()[df_name + '2'].assign(
            geoscale=np.where(vars()[df_name + '2']['Location'] == '00000',
                              'national', vars()[df_name + '2']['geoscale']))
        # ensure all nan/nones filled/match
        vars()[df_name + '2'] = \
            replace_strings_with_NoneType(vars()[df_name + '2'])
        df_list.append(vars()[df_name + '2'])
    # merge the two dataframes
    df = df_list[0].merge(df_list[1], how='outer')

    # determine if any new data is negative
    dfn = df[df['FlowAmount_Modified'] < 0].reset_index(drop=True)
    if len(dfn) > 0:
        vLog.info('There are negative FlowAmounts in new dataframe, '
                  'see Validation Log')
        vLogDetailed.info('Negative FlowAmounts in new dataframe: '
                          '\n {}'.format(dfn.to_string()))

    # Because code will sometimes change terminology, aggregate
    # data by context and flowable to compare df differences
    # subset df
    dfs = df[[
        'Flowable', 'Context', 'ActivityProducedBy', 'ActivityConsumedBy',
        'FlowAmount_Original', 'FlowAmount_Modified', 'Unit', 'geoscale'
    ]]
    agg_cols = [
        'Flowable', 'Context', 'ActivityProducedBy', 'ActivityConsumedBy',
        'Unit', 'geoscale'
    ]
    dfagg = dfs.groupby(agg_cols, dropna=False, as_index=False).agg({
        'FlowAmount_Original': sum,
        'FlowAmount_Modified': sum
    })
    # column calculating difference
    dfagg['FlowAmount_Difference'] = \
        dfagg['FlowAmount_Modified'] - dfagg['FlowAmount_Original']
    dfagg['Percent_Difference'] = (dfagg['FlowAmount_Difference'] /
                                   dfagg['FlowAmount_Original']) * 100
    # drop rows where difference = 0
    dfagg2 = dfagg[dfagg['FlowAmount_Difference'] != 0].reset_index(drop=True)
    if len(dfagg2) == 0:
        vLogDetailed.info('No FlowAmount differences')
    else:
        # subset df and aggregate, also print out the total
        # aggregate diff at the geoscale
        dfagg3 = replace_strings_with_NoneType(dfagg).drop(columns=[
            'ActivityProducedBy', 'ActivityConsumedBy',
            'FlowAmount_Difference', 'Percent_Difference'
        ])
        dfagg4 = dfagg3.groupby(['Flowable', 'Context', 'Unit', 'geoscale'],
                                dropna=False, as_index=False).agg({
                                    'FlowAmount_Original': sum,
                                    'FlowAmount_Modified': sum
                                })
        # column calculating difference
        dfagg4['FlowAmount_Difference'] = \
            dfagg4['FlowAmount_Modified'] - dfagg4['FlowAmount_Original']
        dfagg4['Percent_Difference'] = (dfagg4['FlowAmount_Difference'] /
                                        dfagg4['FlowAmount_Original']) * 100
        # drop rows where difference = 0
        dfagg5 = dfagg4[dfagg4['FlowAmount_Difference'] != 0].reset_index(
            drop=True)
        vLogDetailed.info('Total FlowAmount differences between dataframes: '
                          '\n {}'.format(dfagg5.to_string(index=False)))

        # save detail output in log file
        vLogDetailed.info('Total FlowAmount differences by Activity Columns: '
                          '\n {}'.format(dfagg2.to_string(index=False)))

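# Illustrative sketch (hypothetical values, not part of the original module):
# the difference and percent-difference columns computed above, applied to a
# two-row toy comparison of original vs. modified FlowAmounts.
def _demo_flowamount_diff():
    import pandas as pd

    toy = pd.DataFrame({'FlowAmount_Original': [200.0, 50.0],
                        'FlowAmount_Modified': [150.0, 50.0]})
    toy['FlowAmount_Difference'] = \
        toy['FlowAmount_Modified'] - toy['FlowAmount_Original']
    toy['Percent_Difference'] = (toy['FlowAmount_Difference'] /
                                 toy['FlowAmount_Original']) * 100
    # only the changed row remains: difference of -50.0, i.e. -25%
    return toy[toy['FlowAmount_Difference'] != 0]
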
def check_allocation_ratios(flow_alloc_df_load, activity_set, config, attr):
    """
    Check for issues with the flow allocation ratios
    :param flow_alloc_df_load: df, includes 'FlowAmountRatio' column
    :param activity_set: str, activity set
    :param config: dictionary, method yaml
    :param attr: dictionary, activity set info
    :return: print out information regarding allocation ratios,
        save csv of results to local directory
    """
    # if merge columns are identified in the attr dictionary, they need
    # to be accounted for in the grouping/checking of allocation ratios
    if 'allocation_merge_columns' in attr:
        subset_cols = [
            'FBA_Activity', 'Location', 'SectorLength', 'FlowAmountRatio'
        ] + attr['allocation_merge_columns']
        groupcols = ['FBA_Activity', 'Location', 'SectorLength'
                     ] + attr['allocation_merge_columns']
    else:
        subset_cols = [
            'FBA_Activity', 'Location', 'SectorLength', 'FlowAmountRatio'
        ]
        groupcols = ['FBA_Activity', 'Location', 'SectorLength']

    # create column of sector lengths
    flow_alloc_df = \
        flow_alloc_df_load.assign(
            SectorLength=flow_alloc_df_load['Sector'].str.len())
    # subset df
    flow_alloc_df2 = flow_alloc_df[subset_cols]
    # sum the flow amount ratios by location and sector length
    flow_alloc_df3 = \
        flow_alloc_df2.groupby(
            groupcols, dropna=False, as_index=False).agg(
            {"FlowAmountRatio": sum})
    # keep only rows of specified sector length
    flow_alloc_df4 = flow_alloc_df3[
        flow_alloc_df3['SectorLength'] == sector_level_key[
            config['target_sector_level']]].reset_index(drop=True)
    # keep data where the flowamountratio is greater than or
    # less than 1 by the tolerance (0.01)
    tolerance = 0.01
    flow_alloc_df5 = flow_alloc_df4[
        (flow_alloc_df4['FlowAmountRatio'] < 1 - tolerance) |
        (flow_alloc_df4['FlowAmountRatio'] > 1 + tolerance)]

    if len(flow_alloc_df5) > 0:
        vLog.info('There are %s instances at a sector length of %s '
                  'where the allocation ratio for a location is greater '
                  'than or less than 1 by at least %s. See Validation Log',
                  len(flow_alloc_df5), config["target_sector_level"],
                  str(tolerance))

    # add to validation log
    log.info('Save the summary table of flow allocation ratios for each '
             'sector length for %s in validation log', activity_set)
    # if df not empty, print, if empty, print string
    if flow_alloc_df5.empty:
        vLogDetailed.info('Flow allocation ratios for %s '
                          'all round to 1', activity_set)
    else:
        vLogDetailed.info('Flow allocation ratios for %s: '
                          '\n {}'.format(flow_alloc_df5.to_string()),
                          activity_set)

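# Illustrative sketch (hypothetical ratios, not part of the original module):
# sum allocation ratios per activity/location and flag any group whose total
# falls outside 1 +/- tolerance, mirroring the check above.
def _demo_allocation_ratio_check():
    import pandas as pd

    tolerance = 0.01
    toy = pd.DataFrame({'FBA_Activity': ['A', 'A', 'B'],
                        'Location': ['06000', '06000', '06000'],
                        'FlowAmountRatio': [0.6, 0.4, 0.7]})
    sums = toy.groupby(['FBA_Activity', 'Location'],
                       as_index=False)['FlowAmountRatio'].sum()
    flagged = sums[(sums['FlowAmountRatio'] < 1 - tolerance) |
                   (sums['FlowAmountRatio'] > 1 + tolerance)]
    return flagged  # activity 'A' sums to 1.0; activity 'B' (0.7) is flagged
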
def allocate_usda_ers_mlu_land_in_urban_areas(df, attr, fbs_list):
    """
    This function is used to allocate the USDA_ERS_MLU activity
    'land in urban areas' to NAICS 2012 sectors. Allocation is dependent
    on assumptions defined in 'literature_values.py' as well as results
    from allocating 'EIA_CBECS_Land' and 'EIA_MECS_Land' to land-based
    sectors.

    Methodology is based on the manuscript:
    Lin Zeng and Anu Ramaswami
    Impact of Locational Choices and Consumer Behaviors on Personal
    Land Footprints: An Exploration Across the Urban–Rural Continuum
    in the United States
    Environmental Science & Technology 2020 54 (6), 3091-3102
    DOI: 10.1021/acs.est.9b06024

    :param df: df, USDA ERS MLU Land
    :param attr: dictionary, attribute data from method yaml for activity set
    :param fbs_list: list, FBS dfs for activities created prior to the
        activity set that calls on this fxn
    :return: df, allocated USDA ERS MLU Land, FBS format
    """
    # define sector column to base calculations on
    sector_col = 'SectorConsumedBy'

    vLogDetailed.info('Assuming total land use from MECS and CBECS included '
                      'in urban land area, so subtracting out calculated '
                      'MECS and CBECS land from MLU urban land area')
    # read in the cbecs and mecs df from df_list
    for df_i in fbs_list:
        if (df_i['MetaSources'] == 'EIA_CBECS_Land').all():
            cbecs = df_i
        elif (df_i['MetaSources'] == 'EIA_MECS_Land').all():
            mecs = df_i

    # load the federal highway administration fees dictionary
    fha_dict = get_transportation_sectors_based_on_FHA_fees()
    df_fha = pd.DataFrame.from_dict(
        fha_dict, orient='index').rename(
        columns={'NAICS_2012_Code': sector_col})

    # calculate total residential area from the American Housing Survey
    residential_land_area = get_area_of_urban_land_occupied_by_houses_2013()
    df_residential = df[df[sector_col] == 'F01000']
    df_residential = df_residential.assign(FlowAmount=residential_land_area)

    # make an assumption about the percent of urban area that is open space
    openspace_multiplier = get_open_space_fraction_of_urban_area()
    df_openspace = df[df[sector_col] == '712190']
    df_openspace = df_openspace.assign(
        FlowAmount=df_openspace['FlowAmount'] * openspace_multiplier)

    # sum all uses of urban area that are NOT transportation
    # first concat dfs for residential, openspace, commercial,
    # and manufacturing land use
    df_non_urban_transport_area = pd.concat(
        [df_residential, df_openspace, cbecs, mecs],
        sort=False, ignore_index=True)
    df_non_urban_transport_area = \
        df_non_urban_transport_area[['Location', 'Unit', 'FlowAmount']]
    non_urban_transport_area_sum = df_non_urban_transport_area.groupby(
        ['Location', 'Unit'], as_index=False).agg({
            'FlowAmount': sum
        }).rename(columns={'FlowAmount': 'NonTransport'})
    # compare units
    compare_df_units(df, df_non_urban_transport_area)
    # calculate total urban transportation by subtracting
    # calculated areas from total urban land
    df_transport = df.merge(non_urban_transport_area_sum, how='left')
    df_transport = df_transport.assign(
        FlowAmount=df_transport['FlowAmount'] - df_transport['NonTransport'])
    df_transport.drop(columns=['NonTransport'], inplace=True)

    # make an assumption about the percent of urban transport
    # area used by airports
    airport_multiplier = get_urban_land_use_for_airports()
    df_airport = df_transport[df_transport[sector_col] == '488119']
    df_airport = df_airport.assign(
        FlowAmount=df_airport['FlowAmount'] * airport_multiplier)

    # make an assumption about the percent of urban transport
    # area used by railroads
    railroad_multiplier = get_urban_land_use_for_railroads()
    df_railroad = df_transport[df_transport[sector_col] == '482112']
    df_railroad = df_railroad.assign(
        FlowAmount=df_railroad['FlowAmount'] * railroad_multiplier)

    # further allocate the remaining urban transportation area using
    # Federal Highway Administration fees
    # first subtract area for airports and railroads
    air_rail_area = pd.concat([df_airport, df_railroad], sort=False)
    air_rail_area = air_rail_area[['Location', 'Unit', 'FlowAmount']]
    air_rail_area_sum = air_rail_area.groupby(
        ['Location', 'Unit'], as_index=False).agg({
            'FlowAmount': sum
        }).rename(columns={'FlowAmount': 'AirRail'})

    df_highway = df_transport.merge(air_rail_area_sum, how='left')
    df_highway = df_highway.assign(
        FlowAmount=df_highway['FlowAmount'] - df_highway['AirRail'])
    df_highway.drop(columns=['AirRail'], inplace=True)

    # add fed highway administration fees
    df_highway2 = df_highway.merge(df_fha, how='left')
    df_highway2 = df_highway2[df_highway2['ShareOfFees'].notna()]
    df_highway2 = df_highway2.assign(
        FlowAmount=df_highway2['FlowAmount'] * df_highway2['ShareOfFees'])
    df_highway2.drop(columns=['ShareOfFees'], inplace=True)

    # concat all df subsets
    allocated_urban_areas_df = pd.concat(
        [df_residential, df_openspace, df_airport, df_railroad, df_highway2],
        ignore_index=True, sort=False).reset_index(drop=True)

    return allocated_urban_areas_df

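# Illustrative sketch (hypothetical areas, shares, and sector codes, not part
# of the original module): the remaining urban transport area is split across
# highway-related sectors by their share of Federal Highway Administration
# fees, mirroring the ShareOfFees multiplication above.
def _demo_fha_share_allocation():
    import pandas as pd

    remaining_transport_area = 80.0  # after removing airport/railroad area
    shares = pd.DataFrame({'SectorConsumedBy': ['484', '485', '488'],
                           'ShareOfFees': [0.5, 0.3, 0.2]})
    shares = shares.assign(
        FlowAmount=remaining_transport_area * shares['ShareOfFees'])
    return shares  # 40.0, 24.0, and 16.0 sum back to the remaining area
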
def calculate_net_public_supply(df_load):
    """
    USGS provides info on the quantity of public supply withdrawals that
    are delivered to domestic use. The USGS PS withdrawals are not
    necessarily greater than or equal to the Domestic deliveries because
    water can be withdrawn in one county and delivered in another (water
    can also cross state lines). Therefore, negative net public supply
    values can and do occur, and PS water should only be used at a
    national level.

    Domestic deliveries are subtracted from public supply. An assumption
    is made that PS deliveries to domestic are fresh water. The national
    level data can then be allocated to end users using the BEA Use tables.
    :param df_load: USGS df
    :return: df with net public supply values
    """
    # subset into 2 dfs, one that contains PS data and one that does not
    df1 = df_load[(df_load[fba_activity_fields[0]] == 'Public Supply') |
                  (df_load[fba_activity_fields[1]] == 'Public Supply')]
    df2 = df_load[(df_load[fba_activity_fields[0]] != 'Public Supply') &
                  (df_load[fba_activity_fields[1]] != 'Public Supply')]

    # drop all deliveries to thermo and industrial
    # (not enough states report the data to make usable)
    df1_sub = df1[~df1[fba_activity_fields[1]].isin([
        'Industrial', 'Thermoelectric Power',
        'Thermoelectric Power Closed-loop cooling',
        'Thermoelectric Power Once-through cooling'
    ])]
    # drop duplicate info of "Public Supply deliveries to"
    df1_sub = df1_sub.loc[~df1_sub['Description'].str.
                          contains("Public Supply total deliveries")]
    df1_sub = df1_sub.loc[~df1_sub['Description'].str.
                          contains("deliveries from public supply")]

    # calculate data drop
    vLogDetailed.info('Dropping rows that contain "deliveries from public '
                      'supply" to avoid double counting with rows of "Public '
                      'Supply deliveries to"')
    calculate_flowamount_diff_between_dfs(df1, df1_sub)

    # drop county level values because cannot use county data
    vLogDetailed.info('Dropping county level public supply withdrawals '
                      'because will end up with negative values due to '
                      'instances of water deliveries coming from surrounding '
                      'counties')
    df1_sub = df1_sub[df1_sub['Location'].apply(
        lambda x: x[2:6] == '000')].reset_index(drop=True)

    # df of ps delivered, ps withdrawn, and us total
    df_d = df1_sub[df1_sub[fba_activity_fields[0]] == 'Public Supply']
    df_w = df1_sub[df1_sub[fba_activity_fields[1]] == 'Public Supply']
    df_us = df1_sub[df1_sub['Location'] == '00000']
    # split consumed further into fresh water (assumption that domestic
    # deliveries are freshwater), with the assumption that water withdrawals
    # are taken equally from ground and surface
    df_w1 = df_w[(df_w['FlowName'] == 'fresh') &
                 (df_w['Compartment'] != 'total')]
    df_w2 = df_w[(df_w['FlowName'] == 'fresh') &
                 (df_w['Compartment'] == 'total')]
    # compare units
    compare_df_units(df_w1, df_w2)
    df_wm = pd.merge(df_w1, df_w2[['FlowAmount', 'Location', 'Unit']],
                     how='left', left_on=['Location', 'Unit'],
                     right_on=['Location', 'Unit'])
    df_wm = df_wm.rename(columns={
        "FlowAmount_x": "FlowAmount",
        "FlowAmount_y": "FlowTotal"
    })
    # compare units
    compare_df_units(df_wm, df_d)
    # merge the deliveries to domestic
    df_w_modified = pd.merge(df_wm, df_d[['FlowAmount', 'Location']],
                             how='left', left_on='Location',
                             right_on='Location')
    df_w_modified = df_w_modified.rename(columns={
        "FlowAmount_x": "FlowAmount",
        "FlowAmount_y": "DomesticDeliveries"
    })

    # create flowratio for ground/surface
    df_w_modified.loc[:, 'FlowRatio'] = \
        df_w_modified['FlowAmount'] / df_w_modified['FlowTotal']
    # calculate new, net total public supply withdrawals
    # will end up with negative values due to instances of water
    # deliveries coming from surrounding counties
    df_w_modified.loc[:, 'FlowAmount'] = \
        df_w_modified['FlowAmount'] - (df_w_modified['FlowRatio'] *
                                       df_w_modified['DomesticDeliveries'])

    net_ps = df_w_modified.drop(columns=["FlowTotal", "DomesticDeliveries"])

    # compare units
    compare_df_units(df_d, net_ps)
    # because assuming domestic is all fresh, drop
    # flowname/flowable/Compartment/context
    # and instead use those column data from the net_ps df
    df_d_modified = df_d.drop(
        columns=['FlowName', 'Flowable', 'Compartment', 'Context',
                 'FlowUUID'])
    # also allocate to ground/surface from state ratios
    df_d_modified = pd.merge(df_d_modified,
                             net_ps[[
                                 'FlowName', 'Flowable', 'Compartment',
                                 'Context', 'FlowUUID', 'Location',
                                 'FlowRatio'
                             ]],
                             how='left', left_on='Location',
                             right_on='Location')
    df_d_modified.loc[:, 'FlowAmount'] = \
        df_d_modified['FlowAmount'] * df_d_modified['FlowRatio']
    df_d_modified = df_d_modified.drop(columns=["FlowRatio"])

    net_ps = net_ps.drop(columns=["FlowRatio"])

    # concat dfs back (non-public supply, public supply deliveries,
    # net ps withdrawals, us total)
    modified_ps = pd.concat([df2, df_d_modified, net_ps, df_us],
                            ignore_index=True)

    return modified_ps

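# Illustrative sketch (hypothetical quantities, not part of the original
# module): net public supply withdrawal for one state, where domestic
# deliveries are split between ground and surface water by the withdrawal
# ratio before being subtracted, as in the FlowRatio calculation above.
def _demo_net_public_supply():
    ground_withdrawal = 60.0
    surface_withdrawal = 40.0
    total_withdrawal = ground_withdrawal + surface_withdrawal
    domestic_deliveries = 30.0
    ground_ratio = ground_withdrawal / total_withdrawal  # 0.6
    net_ground = ground_withdrawal - ground_ratio * domestic_deliveries
    return net_ground  # 60.0 - 0.6 * 30.0 = 42.0
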
def equally_allocate_parent_to_child_naics(df_load, target_sector_level):
    """
    Determine rows of data that will be lost if the data are subset at the
    target sector level. Equally allocate parent NAICS to child NAICS where
    child NAICS are missing.
    :param df_load: df, FBS format
    :param target_sector_level: str, target NAICS level for FBS output
    :return: df, with all child NAICS at target sector level
    """
    # exclude nonsectors
    df = replace_NoneType_with_empty_cells(df_load)

    rows_lost = pd.DataFrame()
    for i in range(2, sector_level_key[target_sector_level]):
        # create df of i length
        df_x1 = \
            df.loc[(df[fbs_activity_fields[0]].apply(lambda x: len(x) == i)) &
                   (df[fbs_activity_fields[1]] == '')]
        df_x2 = \
            df.loc[(df[fbs_activity_fields[0]] == '') &
                   (df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x3 = \
            df.loc[(df[fbs_activity_fields[0]].apply(lambda x: len(x) == i)) &
                   (df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x = pd.concat([df_x1, df_x2, df_x3], ignore_index=True, sort=False)

        if len(df_x) > 0:
            # create df of i + 1 length
            df_y1 = df.loc[
                df[fbs_activity_fields[0]].apply(lambda x: len(x) == i + 1) |
                df[fbs_activity_fields[1]].apply(lambda x: len(x) == i + 1)]
            df_y2 = df.loc[
                df[fbs_activity_fields[0]].apply(lambda x: len(x) == i + 1) &
                df[fbs_activity_fields[1]].apply(lambda x: len(x) == i + 1)]
            df_y = pd.concat([df_y1, df_y2], ignore_index=True, sort=False)

            # create temp sector columns in df y, that are i digits in length
            df_y.loc[:, 'spb_tmp'] = \
                df_y[fbs_activity_fields[0]].apply(lambda x: x[0:i])
            df_y.loc[:, 'scb_tmp'] = \
                df_y[fbs_activity_fields[1]].apply(lambda x: x[0:i])
            # don't modify household sector lengths or gov't transport
            df_y = df_y.replace({'F0': 'F010', 'F01': 'F010'})

            # merge the two dfs
            if 'Context' in df_y.columns:
                merge_cols = [
                    'Class', 'Context', 'FlowType', 'Flowable', 'Location',
                    'LocationSystem', 'Unit', 'Year'
                ]
            else:
                merge_cols = [
                    'Class', 'FlowType', 'Location', 'LocationSystem',
                    'Unit', 'Year'
                ]
            df_m = pd.merge(df_x,
                            df_y[merge_cols + ['spb_tmp', 'scb_tmp']],
                            how='left',
                            left_on=merge_cols + ['SectorProducedBy',
                                                  'SectorConsumedBy'],
                            right_on=merge_cols + ['spb_tmp', 'scb_tmp'])

            # extract the rows that are not disaggregated to more
            # specific naics
            rl = df_m[(df_m['scb_tmp'].isnull()) &
                      (df_m['spb_tmp'].isnull())].reset_index(drop=True)
            # clean df
            rl = replace_strings_with_NoneType(rl)
            rl_list = rl[['SectorProducedBy', 'SectorConsumedBy']] \
                .drop_duplicates().values.tolist()

            # match sectors with target sector length sectors
            # import cw and subset to current sector length
            # and target sector length
            cw_load = load_crosswalk('sector_length')
            nlength = list(sector_level_key.keys())[list(
                sector_level_key.values()).index(i)]
            cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
            # add column with counts
            cw['sector_count'] = \
                cw.groupby(nlength)[nlength].transform('count')

            # merge df & conditionally replace sector produced/consumed cols
            rl_m = pd.merge(rl, cw, how='left',
                            left_on=[fbs_activity_fields[0]],
                            right_on=[nlength])
            rl_m.loc[rl_m[fbs_activity_fields[0]] != '',
                     fbs_activity_fields[0]] = rl_m[target_sector_level]
            rl_m = rl_m.drop(columns=[nlength, target_sector_level])

            rl_m2 = pd.merge(rl_m, cw, how='left',
                             left_on=[fbs_activity_fields[1]],
                             right_on=[nlength])
            rl_m2.loc[rl_m2[fbs_activity_fields[1]] != '',
                      fbs_activity_fields[1]] = rl_m2[target_sector_level]
            rl_m2 = rl_m2.drop(columns=[nlength, target_sector_level])

            # create one sector count column
            rl_m2['sector_count_x'] = \
                rl_m2['sector_count_x'].fillna(rl_m2['sector_count_y'])
            rl_m3 = rl_m2.rename(columns={'sector_count_x': 'sector_count'})
            rl_m3 = rl_m3.drop(columns=['sector_count_y'])

            # calculate new flow amounts, based on sector count,
            # allocating equally to the new sector length codes
            rl_m3['FlowAmount'] = rl_m3['FlowAmount'] / rl_m3['sector_count']
            rl_m3 = rl_m3.drop(columns=['sector_count', 'spb_tmp', 'scb_tmp'])

            # append to df
            if len(rl) != 0:
                vLogDetailed.warning('Data found at %s digit NAICS not '
                                     'represented in current data subset: '
                                     '{}'.format(' '.join(map(str, rl_list))),
                                     str(i))
                rows_lost = pd.concat([rows_lost, rl_m3], ignore_index=True)

    if len(rows_lost) != 0:
        vLogDetailed.info('Allocating FlowAmounts equally to '
                          'each %s associated with the sectors previously '
                          'dropped', target_sector_level)

    # add rows of missing data to the fbs sector subset
    df_w_lost_data = pd.concat([df, rows_lost], ignore_index=True, sort=True)
    df_w_lost_data = replace_strings_with_NoneType(df_w_lost_data)

    return df_w_lost_data

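# Illustrative sketch (hypothetical NAICS codes, column names, and amounts,
# not part of the original module): a parent sector's FlowAmount is divided
# equally among the target-length child codes it maps to, mirroring the
# sector_count division above.
def _demo_equal_parent_to_child_allocation():
    import pandas as pd

    parent = pd.DataFrame({'SectorProducedBy': ['311'],
                           'FlowAmount': [90.0]})
    crosswalk = pd.DataFrame({'NAICS_3': ['311', '311', '311'],
                              'NAICS_4': ['3111', '3112', '3113']})
    crosswalk['sector_count'] = \
        crosswalk.groupby('NAICS_3')['NAICS_3'].transform('count')
    merged = parent.merge(crosswalk, left_on='SectorProducedBy',
                          right_on='NAICS_3', how='left')
    merged['SectorProducedBy'] = merged['NAICS_4']
    merged['FlowAmount'] = merged['FlowAmount'] / merged['sector_count']
    # 30.0 is allocated to each of the three child sectors
    return merged[['SectorProducedBy', 'FlowAmount']]
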