def prepare_stewi_fbs(df, inventory_dict, NAICS_level, geo_scale):
    """
    Aggregate a combined inventory dataframe and reshape it into
    flowbysector format.

    The caller's dataframe is left untouched: rows with a missing
    'Location' are dropped on a new frame rather than in place.

    :param df: dataframe of inventory records; the columns read here are
        'Location', 'NAICS_lvl', 'FlowName', 'Compartment', 'FlowAmount',
        'Year', 'Unit', 'DataReliability' and, when present, 'MetaSources'
    :param inventory_dict: dictionary of inventories; its keys are passed to
        map_elementary_flows and its first value to
        assign_fips_location_system
    :param NAICS_level: passed to check_for_missing_sector_data
    :param geo_scale: passed to update_geoscale
    :return: dataframe sorted by the flow_by_sector_fields columns
    """
    from stewi.globals import weighted_average

    # update location to appropriate geoscale prior to aggregating;
    # dropna/assign return new frames, so the input df is not mutated
    df = df.dropna(subset=['Location'])
    df = df.assign(Location=df['Location'].astype(str))
    df = update_geoscale(df, geo_scale)

    # assign grouping variables based on desired geographic aggregation level
    grouping_vars = ['NAICS_lvl', 'FlowName', 'Compartment', 'Location']
    if 'MetaSources' in df:
        grouping_vars.append('MetaSources')

    # aggregate by NAICS code, FlowName, compartment, and geographic level
    fbs = df.groupby(grouping_vars).agg({
        'FlowAmount': 'sum',
        'Year': 'first',
        'Unit': 'first',
    })

    # add reliability score; assigned while fbs is still indexed by the
    # grouping variables, before the index is reset
    fbs['DataReliability'] = weighted_average(
        df, 'DataReliability', 'FlowAmount', grouping_vars)
    fbs = fbs.reset_index()

    # apply flow mapping
    fbs = map_elementary_flows(fbs, list(inventory_dict.keys()))

    # rename columns to match flowbysector format
    fbs = fbs.rename(columns={"NAICS_lvl": "SectorProducedBy"})

    # add hardcoded data, depending on the source data, some of these
    # fields may need to change
    fbs['Class'] = 'Chemicals'
    fbs['SectorConsumedBy'] = 'None'
    fbs['SectorSourceName'] = 'NAICS_2012_Code'
    fbs['FlowType'] = 'ELEMENTARY_FLOW'

    fbs = assign_fips_location_system(fbs, list(inventory_dict.values())[0])

    # add missing flow by sector fields
    fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)

    fbs = check_for_missing_sector_data(fbs, NAICS_level)

    # sort dataframe and reset index
    fbs = fbs.sort_values(
        list(flow_by_sector_fields.keys())).reset_index(drop=True)

    # check the sector codes to make sure NAICS 2012 codes
    fbs = replace_naics_w_naics_2012(fbs, 'NAICS_2012_Code')

    return fbs
def main(method_name):
    """
    Creates a flowbysector dataset

    The method is loaded with load_method and each entry of its
    'source_names' is processed in turn. Sources whose 'data_format' is
    'FBA' are handled one activity set at a time: the flows are subset,
    given sectors, mapped to elementary flows and then allocated according
    to the activity set's 'allocation_method' ('direct',
    'allocation_function', or ratio-based using an allocation
    flowbyactivity dataset). Other sources are cleaned and appended as-is.
    All results are concatenated, aggregated, sorted and handed to
    store_flowbysector.

    :param method_name: Name of method corresponding to flowbysector method
        yaml name
    :return: None; the combined dataframe is passed to store_flowbysector
    """

    log.info("Initiating flowbysector creation for " + method_name)
    # call on method
    method = load_method(method_name)
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # Create empty list for storing fbs files
    fbs_list = []
    for k, v in fb.items():
        # k is the source name, v is that source's settings from the method
        # pull fba data for allocation
        flows = load_source_dataframe(k, v)

        if v['data_format'] == 'FBA':
            # ensure correct datatypes and that all fields exist
            flows = clean_df(flows, flow_by_activity_fields,
                             fba_fill_na_dict, drop_description=False)

            # clean up fba, if specified in yaml
            if v["clean_fba_df_fxn"] != 'None':
                log.info("Cleaning up " + k + " FlowByActivity")
                # the yaml value is the name of a function available in
                # this module; look it up by name and call it
                flows = getattr(sys.modules[__name__],
                                v["clean_fba_df_fxn"])(flows)

            # if activity_sets are specified in a file, call them here
            if 'activity_set_file' in v:
                aset_names = pd.read_csv(flowbysectoractivitysetspath +
                                         v['activity_set_file'], dtype=str)

            # create dictionary of allocation datasets for different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities; names come from the activity
                # set file when one is given, otherwise from the yaml
                if 'activity_set_file' in v:
                    names = aset_names[aset_names['activity_set'] == aset]['name']
                else:
                    names = attr['names']

                log.info("Preparing to handle subset of flownames " +
                         ', '.join(map(str, names)) + " in " + k)
                # subset fba data by activity: keep rows where either
                # activity column is one of the names
                flows_subset = flows[
                    (flows[fba_activity_fields[0]].isin(names)) |
                    (flows[fba_activity_fields[1]].isin(names))].reset_index(
                        drop=True)

                # extract relevant geoscale data or aggregate existing data
                log.info("Subsetting/aggregating dataframe to " +
                         attr['allocation_from_scale'] + " geoscale")
                flows_subset_geo = subset_df_by_geoscale(
                    flows_subset, v['geoscale_to_use'],
                    attr['allocation_from_scale'])

                # Add sectors to df activity, depending on level of specified sector aggregation
                log.info("Adding sectors to " + k)
                flow_subset_wsec = add_sectors_to_flowbyactivity(
                    flows_subset_geo,
                    sectorsourcename=method['target_sector_source'],
                    allocationmethod=attr['allocation_method'])
                # clean up fba with sectors, if specified in yaml
                if v["clean_fba_w_sec_df_fxn"] != 'None':
                    log.info("Cleaning up " + k +
                             " FlowByActivity with sectors")
                    flow_subset_wsec = getattr(sys.modules[__name__],
                                               v["clean_fba_w_sec_df_fxn"])(
                                                   flow_subset_wsec, attr=attr)

                # map df to elementary flows; use the mapping named in the
                # yaml when present, otherwise the source name itself
                log.info("Mapping flows in " + k +
                         ' to federal elementary flow list')
                if 'fedefl_mapping' in v:
                    mapping_files = v['fedefl_mapping']
                else:
                    mapping_files = k

                flow_subset_mapped = map_elementary_flows(
                    flow_subset_wsec, mapping_files)

                # clean up mapped fba with sectors, if specified in yaml
                if "clean_mapped_fba_w_sec_df_fxn" in v:
                    log.info("Cleaning up " + k +
                             " FlowByActivity with sectors")
                    flow_subset_mapped = getattr(
                        sys.modules[__name__],
                        v["clean_mapped_fba_w_sec_df_fxn"])(flow_subset_mapped,
                                                            attr, method)

                # if allocation method is "direct", then no need to create alloc ratios, else need to use allocation
                # dataframe to create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    log.info('Directly assigning ' +
                             ', '.join(map(str, names)) + ' to sectors')
                    fbs = flow_subset_mapped.copy()
                    # for each activity, if activities are not sector like, check that there is no data loss
                    if load_source_catalog(
                    )[k]['sector-like_activities'] is False:
                        activity_list = []
                        for n in names:
                            log.info('Checking for ' + n + ' at ' +
                                     method['target_sector_level'])
                            # rows where either activity column equals n
                            # (the first, "both equal" term is already
                            # covered by the two single-column terms)
                            fbs_subset = fbs[(
                                (fbs[fba_activity_fields[0]] == n) &
                                (fbs[fba_activity_fields[1]] == n)) |
                                (fbs[fba_activity_fields[0]] == n) |
                                (fbs[fba_activity_fields[1]] == n
                                 )].reset_index(drop=True)
                            fbs_subset = check_if_losing_sector_data(
                                fbs_subset, method['target_sector_level'])
                            activity_list.append(fbs_subset)
                        fbs = pd.concat(activity_list, ignore_index=True)

                # if allocation method for an activity set requires a specific function due to the complicated nature
                # of the allocation, call on function here
                elif attr['allocation_method'] == 'allocation_function':
                    log.info(
                        'Calling on function specified in method yaml to allocate '
                        + ', '.join(map(str, names)) + ' to sectors')
                    # here 'allocation_source' holds a function name, which
                    # is looked up in this module and called
                    fbs = getattr(sys.modules[__name__],
                                  attr['allocation_source'])(
                                      flow_subset_mapped, attr, fbs_list)

                else:
                    # determine appropriate allocation dataset
                    log.info("Loading allocation flowbyactivity " +
                             attr['allocation_source'] + " for year " +
                             str(attr['allocation_source_year']))
                    fba_allocation = flowsa.getFlowByActivity(
                        flowclass=[attr['allocation_source_class']],
                        datasource=attr['allocation_source'],
                        years=[attr['allocation_source_year']
                               ]).reset_index(drop=True)

                    # clean df and harmonize units
                    fba_allocation = clean_df(fba_allocation,
                                              flow_by_activity_fields,
                                              fba_fill_na_dict)
                    fba_allocation = harmonize_units(fba_allocation)

                    # check if allocation data exists at specified geoscale to use
                    log.info("Checking if allocation data exists at the " +
                             attr['allocation_from_scale'] + " level")
                    check_if_data_exists_at_geoscale(
                        fba_allocation, attr['allocation_from_scale'])

                    # aggregate geographically to the scale of the flowbyactivity source, if necessary
                    fba_allocation = subset_df_by_geoscale(
                        fba_allocation, attr['allocation_from_scale'],
                        v['geoscale_to_use'])

                    # subset based on yaml settings
                    if attr['allocation_flow'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['FlowName'].isin(
                                attr['allocation_flow'])]
                    if attr['allocation_compartment'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['Compartment'].isin(
                                attr['allocation_compartment'])]

                    # cleanup the fba allocation df, if necessary
                    if 'clean_allocation_fba' in attr:
                        log.info("Cleaning " + attr['allocation_source'])
                        fba_allocation = getattr(sys.modules[__name__],
                                                 attr["clean_allocation_fba"])(
                                                     fba_allocation, attr=attr)
                    # reset index
                    fba_allocation = fba_allocation.reset_index(drop=True)

                    # assign sector to allocation dataset
                    log.info("Adding sectors to " + attr['allocation_source'])
                    fba_allocation_wsec = add_sectors_to_flowbyactivity(
                        fba_allocation,
                        sectorsourcename=method['target_sector_source'])

                    # call on fxn to further clean up/disaggregate the fba allocation data, if exists
                    if 'clean_allocation_fba_w_sec' in attr:
                        log.info("Further disaggregating sectors in " +
                                 attr['allocation_source'])
                        fba_allocation_wsec = getattr(
                            sys.modules[__name__],
                            attr["clean_allocation_fba_w_sec"])(
                                fba_allocation_wsec, attr=attr, method=method)

                    # subset fba datasets to only keep the sectors associated with activity subset
                    log.info("Subsetting " + attr['allocation_source'] +
                             " for sectors in " + k)
                    fba_allocation_subset = get_fba_allocation_subset(
                        fba_allocation_wsec,
                        k,
                        names,
                        flowSubsetMapped=flow_subset_mapped,
                        allocMethod=attr['allocation_method'])

                    # if there is an allocation helper dataset, modify allocation df
                    if attr['allocation_helper'] == 'yes':
                        log.info(
                            "Using the specified allocation help for subset of "
                            + attr['allocation_source'])
                        fba_allocation_subset = allocation_helper(
                            fba_allocation_subset, attr, method, v)

                    # create flow allocation ratios for each activity
                    # if load_source_catalog()[k]['sector-like_activities']
                    flow_alloc_list = []
                    # group on the default mapped-fba fields, minus the two
                    # activity columns
                    group_cols = fba_mapped_default_grouping_fields
                    group_cols = [
                        e for e in group_cols
                        if e not in ('ActivityProducedBy',
                                     'ActivityConsumedBy')
                    ]
                    for n in names:
                        log.info("Creating allocation ratios for " + n)
                        fba_allocation_subset_2 = get_fba_allocation_subset(
                            fba_allocation_subset,
                            k, [n],
                            flowSubsetMapped=flow_subset_mapped,
                            allocMethod=attr['allocation_method'])
                        if len(fba_allocation_subset_2) == 0:
                            log.info("No data found to allocate " + n)
                        else:
                            flow_alloc = allocate_by_sector(
                                fba_allocation_subset_2,
                                k,
                                attr['allocation_source'],
                                attr['allocation_method'],
                                group_cols,
                                flowSubsetMapped=flow_subset_mapped)
                            # tag the ratios with the activity they belong
                            # to; used as a merge key below
                            flow_alloc = flow_alloc.assign(FBA_Activity=n)
                            flow_alloc_list.append(flow_alloc)
                    # NOTE(review): if no activity yielded allocation data the
                    # list is empty and pd.concat will raise - confirm that
                    # cannot happen for valid methods
                    flow_allocation = pd.concat(flow_alloc_list,
                                                ignore_index=True)

                    # generalize activity field names to enable link to main fba source
                    log.info("Generalizing activity columns in subset of " +
                             attr['allocation_source'])
                    flow_allocation = collapse_activity_fields(flow_allocation)

                    # check for issues with allocation ratios
                    check_allocation_ratios(flow_allocation, aset, k,
                                            method_name)

                    # create list of sectors in the flow allocation df, drop any rows of data in the flow df that
                    # aren't in list
                    sector_list = flow_allocation['Sector'].unique().tolist()

                    # subset fba allocation table to the values in the activity list, based on overlapping sectors
                    flow_subset_mapped = flow_subset_mapped.loc[
                        (flow_subset_mapped[fbs_activity_fields[0]].
                         isin(sector_list)) |
                        (flow_subset_mapped[fbs_activity_fields[1]].
                         isin(sector_list))]

                    # check if fba and allocation dfs have the same LocationSystem
                    log.info(
                        "Checking if flowbyactivity and allocation dataframes use the same location systems"
                    )
                    check_if_location_systems_match(flow_subset_mapped,
                                                    flow_allocation)

                    # merge fba df w/flow allocation dataset; the ratios are
                    # joined twice, first on the produced-by columns and then
                    # on the consumed-by columns, which yields the _x/_y
                    # suffixed columns used below
                    log.info("Merge " + k + " and subset of " +
                             attr['allocation_source'])
                    fbs = flow_subset_mapped.merge(
                        flow_allocation[[
                            'Location', 'Sector', 'FlowAmountRatio',
                            'FBA_Activity'
                        ]],
                        left_on=[
                            'Location', 'SectorProducedBy',
                            'ActivityProducedBy'
                        ],
                        right_on=['Location', 'Sector', 'FBA_Activity'],
                        how='left')

                    fbs = fbs.merge(
                        flow_allocation[[
                            'Location', 'Sector', 'FlowAmountRatio',
                            'FBA_Activity'
                        ]],
                        left_on=[
                            'Location', 'SectorConsumedBy',
                            'ActivityConsumedBy'
                        ],
                        right_on=['Location', 'Sector', 'FBA_Activity'],
                        how='left')

                    # merge the flowamount columns: prefer the produced-by
                    # ratio, fall back to the consumed-by ratio
                    fbs.loc[:, 'FlowAmountRatio'] = fbs[
                        'FlowAmountRatio_x'].fillna(fbs['FlowAmountRatio_y'])
                    # fill null rows with 0 because no allocation info
                    fbs['FlowAmountRatio'] = fbs['FlowAmountRatio'].fillna(0)

                    # check if fba and alloc dfs have data for same geoscales - comment back in after address the 'todo'
                    # log.info("Checking if flowbyactivity and allocation dataframes have data at the same locations")
                    # check_if_data_exists_for_same_geoscales(fbs, k, attr['names'])

                    # drop rows where there is no allocation data
                    # NOTE(review): reset_index() without drop=True keeps the
                    # old index as a column - presumably removed later by
                    # clean_df; confirm
                    fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'],
                                     how='all').reset_index()

                    # calculate flow amounts for each sector
                    log.info("Calculating new flow amounts using flow ratios")
                    fbs.loc[:, 'FlowAmount'] = fbs['FlowAmount'] * fbs[
                        'FlowAmountRatio']

                    # drop columns
                    log.info("Cleaning up new flow by sector")
                    fbs = fbs.drop(columns=[
                        'Sector_x', 'FlowAmountRatio_x', 'Sector_y',
                        'FlowAmountRatio_y', 'FlowAmountRatio',
                        'FBA_Activity_x', 'FBA_Activity_y'
                    ])

                # the steps below apply to the result of every allocation
                # method
                # drop rows where flowamount = 0 (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # define grouping columns dependent on sectors being activity-like or not
                if load_source_catalog()[k]['sector-like_activities'] is False:
                    groupingcols = fbs_grouping_fields_w_activities
                    groupingdict = flow_by_sector_fields_w_activity
                else:
                    groupingcols = fbs_default_grouping_fields
                    groupingdict = flow_by_sector_fields

                # clean df
                fbs = clean_df(fbs, groupingdict, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                # todo: replace with fxn return_from_scale
                log.info("Aggregating flowbysector to " +
                         method['target_geoscale'] + " level")
                # start from whichever of the two scales has the smaller
                # fips_number_key value
                if fips_number_key[v['geoscale_to_use']] < fips_number_key[
                        attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']

                to_scale = method['target_geoscale']

                fbs_geo_agg = agg_by_geoscale(fbs, from_scale, to_scale,
                                              groupingcols)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs_sec_agg = sector_aggregation(fbs_geo_agg, groupingcols)
                # add missing naics5/6 when only one naics5/6 associated with a naics4
                fbs_agg = sector_disaggregation(fbs_sec_agg, groupingdict)

                # check if any sector information is lost before reaching the target sector length, if so,
                # allocate values equally to disaggregated sectors
                log.info('Checking for data at ' +
                         method['target_sector_level'])
                fbs_agg_2 = check_if_losing_sector_data(
                    fbs_agg, method['target_sector_level'])

                # compare flowbysector with flowbyactivity
                # todo: modify fxn to work if activities are sector like in df being allocated
                if load_source_catalog()[k]['sector-like_activities'] is False:
                    check_for_differences_between_fba_load_and_fbs_output(
                        flow_subset_mapped, fbs_agg_2, aset, k, method_name)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])

                # subset df, necessary because not all of the sectors are NAICS and can get duplicate rows
                # fbs_1: both sector columns are in the list
                fbs_1 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list)
                     )].reset_index(drop=True)
                # fbs_2: first sector column in the list, second is null
                fbs_2 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isnull())].reset_index(
                        drop=True)
                # fbs_3: first sector column is null, second in the list
                fbs_3 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isnull()) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list)
                     )].reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2, fbs_3])

                # drop activity columns; errors='ignore' because they are
                # not always present
                fbs_sector_subset = fbs_sector_subset.drop(
                    ['ActivityProducedBy', 'ActivityConsumedBy'],
                    axis=1,
                    errors='ignore')

                # save comparison of FBA total to FBS total for an activity set
                compare_fba_load_and_fbs_output_totals(flows_subset_geo,
                                                       fbs_sector_subset, aset,
                                                       k, method_name, attr,
                                                       method, mapping_files)

                log.info(
                    "Completed flowbysector for activity subset with flows " +
                    ', '.join(map(str, names)))
                fbs_list.append(fbs_sector_subset)
        else:
            # if the loaded flow dt is already in FBS format, append directly to list of FBS
            log.info("Append " + k + " to FBS list")
            # ensure correct field datatypes and add any missing fields
            flows = clean_df(flows, flow_by_sector_fields, fbs_fill_na_dict)
            fbs_list.append(flows)
    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbs_list, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # aggregate df as activities might have data for the same specified sector length
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    # add missing fields, ensure correct data type, reorder columns
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable',
         'Context']).reset_index(drop=True)
    # save parquet file
    store_flowbysector(fbss, method_name)
]
    # (tail of a function whose start lies above this point)
    # keep the names of the activities that belong to the activity sets in
    # asets, then keep only the rows produced by those activities
    activities = aset_names[aset_names['activity_set'].isin(asets)]['name']
    test_fba = test_fba[test_fba['ActivityProducedBy'].isin(activities)]

    return test_fba


def get_fbs_subset(name):
    """
    Load a FlowBySector dataset
    :param name: passed unchanged to flowsa.getFlowBySector
    :return: the value returned by flowsa.getFlowBySector
    """
    test_fbs = flowsa.getFlowBySector(name)

    return test_fbs


if __name__ == '__main__':
    # Compare FlowByActivity totals with FlowBySector totals per Flowable.
    # fba_source, fba_year, fba_class, fba_agg and fbs_source are not
    # defined in this part of the file - presumably module-level settings;
    # verify.

    # load the flowbyactivity subset and aggregate it to the national level
    fba = get_fba_subset(fba_source, fba_year, fba_class)
    fba = agg_by_geoscale(fba, fba_agg,'national', fba_default_grouping_fields)
    fba = fba[['FlowName','ActivityProducedBy','FlowAmount','Unit','Compartment']]
    # NOTE(review): the mapping name 'NEI' is hardcoded while the source is
    # configurable through fba_source - confirm they are meant to match
    fba = map_elementary_flows(fba, 'NEI')

    # flow amounts by Flowable (rows) and activity (columns);
    # margins=True adds row and column totals
    fba_pivot = pd.pivot_table(fba, values = 'FlowAmount', index =['Flowable'],
                               columns='ActivityProducedBy', aggfunc='sum',
                               margins = True).reset_index()

    # same table for the flowbysector data, by sector
    fbs = get_fbs_subset(fbs_source)
    fbs = fbs[['Flowable','SectorProducedBy','FlowAmount']]
    fbs_pivot = pd.pivot_table(fbs, values = 'FlowAmount', index =['Flowable'],
                               columns='SectorProducedBy', aggfunc='sum',
                               margins = True).reset_index()

    # total FlowAmount per Flowable in each dataset
    fba = fba.groupby('Flowable').agg({'FlowAmount': 'sum'})
    fba.rename(columns={'FlowAmount':'FBA_amount'}, inplace=True)
    fbs = fbs.groupby('Flowable').agg({'FlowAmount': 'sum'})
    fbs.rename(columns={'FlowAmount':'FBS_amount'}, inplace=True)
    # outer merge keeps Flowables that appear in only one of the datasets
    comparison = fba.merge(fbs, how='outer', on ='Flowable')
    # ratio of FBS total to FBA total for each Flowable
    comparison['Ratio'] = comparison['FBS_amount'] / comparison ['FBA_amount']
def compare_fba_load_and_fbs_output_totals(fba_load, fbs_load, activity_set,
                                           source_name, method_name, attr,
                                           method, mapping_files):
    """
    Function to compare the loaded flowbyactivity total with the final
    flowbysector output total. The percent difference is logged per context
    and the comparison table is written to a csv in the output folder.

    The comparison is best-effort: a failure while merging, reporting or
    saving is logged and does not interrupt the caller.

    :param fba_load: flowbyactivity dataframe as loaded (before allocation)
    :param fbs_load: flowbysector dataframe to compare against
    :param activity_set: str, activity set name, used in log messages and
        in the csv file name
    :param source_name: str, key into the source catalog; also used in log
        messages and in the csv file name
    :param method_name: str, used in the csv file name
    :param attr: dictionary; 'allocation_from_scale' is read
    :param method: dictionary; 'target_geoscale' is read
    :param mapping_files: passed to map_elementary_flows
    :return: None
    """

    from flowsa.flowbyfunctions import subset_df_by_geoscale, sector_aggregation
    from flowsa.common import load_source_catalog
    from flowsa.mapping import map_elementary_flows

    log.info(
        'Comparing loaded FlowByActivity FlowAmount total to subset FlowBySector FlowAmount total'
    )

    # load source catalog
    cat = load_source_catalog()
    src_info = cat[source_name]

    # extract relevant geoscale data or aggregate existing data
    fba = subset_df_by_geoscale(fba_load, attr['allocation_from_scale'],
                                method['target_geoscale'])
    # map loaded fba
    fba = map_elementary_flows(fba, mapping_files, keep_unmapped_rows=True)
    if src_info['sector-like_activities']:
        # if activities are sector-like, run sector aggregation and then
        # subset df to only keep NAICS2
        fba = fba[[
            'Class', 'FlowAmount', 'Unit', 'Context', 'ActivityProducedBy',
            'ActivityConsumedBy', 'Location', 'LocationSystem'
        ]]
        # rename the activity cols to sector cols for purposes of aggregation
        fba = fba.rename(
            columns={
                'ActivityProducedBy': 'SectorProducedBy',
                'ActivityConsumedBy': 'SectorConsumedBy'
            })
        group_cols_agg = [
            'Class', 'Context', 'Unit', 'Location', 'LocationSystem',
            'SectorProducedBy', 'SectorConsumedBy'
        ]
        fba = sector_aggregation(fba, group_cols_agg)
        # subset fba to only include NAICS2; None values are replaced first
        # so that len() can be applied to every cell
        fba = replace_NoneType_with_empty_cells(fba)
        fba = fba[fba['SectorConsumedBy'].apply(lambda x: len(x) == 2)
                  | fba['SectorProducedBy'].apply(lambda x: len(x) == 2)]

    # subset/agg dfs
    col_subset = [
        'Class', 'FlowAmount', 'Unit', 'Context', 'Location', 'LocationSystem'
    ]
    group_cols = ['Class', 'Unit', 'Context', 'Location', 'LocationSystem']
    # fba
    fba = fba[col_subset]
    fba_agg = aggregator(fba, group_cols).reset_index(drop=True)
    fba_agg = fba_agg.rename(columns={
        'FlowAmount': 'FBA_amount',
        'Unit': 'FBA_unit'
    })

    # fbs
    fbs = fbs_load[col_subset]
    fbs_agg = aggregator(fbs, group_cols)
    fbs_agg = fbs_agg.rename(columns={
        'FlowAmount': 'FBS_amount',
        'Unit': 'FBS_unit'
    })

    try:
        # merge FBA and FBS totals
        df_merge = fba_agg.merge(fbs_agg, how='left')
        df_merge['FlowAmount_difference'] = df_merge['FBA_amount'] - df_merge[
            'FBS_amount']
        df_merge['Percent_difference'] = (df_merge['FlowAmount_difference'] /
                                          df_merge['FBA_amount']) * 100

        # reorder
        df_merge = df_merge[[
            'Class', 'Context', 'Location', 'LocationSystem', 'FBA_amount',
            'FBA_unit', 'FBS_amount', 'FBS_unit', 'FlowAmount_difference',
            'Percent_difference'
        ]]
        df_merge = replace_NoneType_with_empty_cells(df_merge)

        # list of contexts; de-duplicated because only the first row of
        # each context is reported, so repeats would log the same line again
        context_list = df_merge['Context'].drop_duplicates().to_list()

        # loop through the contexts and print results of comparison
        for context in context_list:
            df_merge_subset = df_merge[
                df_merge['Context'] == context].reset_index(drop=True)
            diff_per = df_merge_subset['Percent_difference'][0]
            # make reporting more manageable
            if abs(diff_per) > 0.001:
                diff_per = round(diff_per, 2)
            else:
                diff_per = round(diff_per, 6)

            # a positive difference means the FBS total is the smaller one
            direction = 'less' if diff_per > 0 else 'more'
            log.info('The total FlowBySector FlowAmount for ' + source_name +
                     ' ' + activity_set + ' ' + context + ' is ' +
                     str(abs(diff_per)) + '% ' + direction +
                     ' than the total FlowByActivity FlowAmount')

        # save csv to output folder
        log.info(
            'Save the comparison of FlowByActivity load to FlowBySector total FlowAmounts for '
            + activity_set + ' in output folder')
        # output data at all sector lengths
        df_merge.to_csv(outputpath + "FlowBySectorMethodAnalysis/" +
                        method_name + '_' + source_name +
                        "_FBA_total_to_FBS_total_FlowAmount_comparison_" +
                        activity_set + ".csv",
                        index=False)

    except Exception as err:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt/SystemExit still propagate, and report the
        # cause instead of hiding it.
        log.info(
            'Error occured when comparing total FlowAmounts for FlowByActivity and FlowBySector'
        )
        log.info('Comparison error details: ' + repr(err))

    return None
def main(method_name):
    """
    Creates a flowbysector dataset

    The method is loaded with load_method and each entry of its
    'source_names' is processed in turn. For sources whose 'data_format' is
    'FBA', every activity set is handled activity by activity: the flows are
    brought to a common geoscale, given sectors, mapped to elementary flows
    and then either assigned directly or multiplied by allocation ratios
    derived from an allocation flowbyactivity dataset. Other sources are
    appended as-is. All results are concatenated, aggregated, cleaned,
    sorted and handed to store_flowbysector.

    :param method_name: Name of method corresponding to flowbysector method
        yaml name
    :return: None; the combined dataframe is passed to store_flowbysector
    """

    log.info("Initiating flowbysector creation for " + method_name)
    # call on method
    method = load_method(method_name)
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # Create empty list for storing fbs files (rebound to the concatenated
    # dataframe after the loop)
    fbss = []
    for k, v in fb.items():
        # k is the source name, v is that source's settings from the method
        # pull fba data for allocation
        flows = load_source_dataframe(k, v)

        if v['data_format'] == 'FBA':
            # clean up fba, if specified in yaml
            if v["clean_fba_df_fxn"] != 'None':
                log.info("Cleaning up " + k + " FlowByActivity")
                # the yaml value is the name of a function available in
                # this module; look it up by name and call it
                flows = getattr(sys.modules[__name__], v["clean_fba_df_fxn"])(flows)

            flows = clean_df(flows, flow_by_activity_fields, fba_fill_na_dict)

            # create dictionary of allocation datasets for different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities
                names = attr['names']
                log.info("Preparing to handle subset of flownames " +
                         ', '.join(map(str, names)) + " in " + k)

                # check if flowbyactivity data exists at specified geoscale to use
                flow_subset_list = []
                for n in names:
                    # subset usgs data by activity
                    flow_subset = flows[(flows[fba_activity_fields[0]] == n) |
                                        (flows[fba_activity_fields[1]] == n)].reset_index(drop=True)
                    log.info("Checking if flowbyactivity data exists for " + n +
                             " at the " + v['geoscale_to_use'] + ' level')
                    geocheck = check_if_data_exists_at_geoscale(flow_subset, v['geoscale_to_use'],
                                                                activitynames=n)
                    # aggregate geographically to the scale of the allocation dataset
                    if geocheck == "Yes":
                        activity_from_scale = v['geoscale_to_use']
                    else:
                        # if activity does not exist at specified geoscale, issue warning and use data at less aggregated
                        # geoscale, and sum to specified geoscale
                        log.info("Checking if flowbyactivity data exists for " + n +
                                 " at a less aggregated level")
                        activity_from_scale = check_if_data_exists_at_less_aggregated_geoscale(
                            flow_subset, v['geoscale_to_use'], n)

                    activity_to_scale = attr['allocation_from_scale']
                    # if df is less aggregated than allocation df, aggregate usgs activity to allocation geoscale
                    if fips_number_key[activity_from_scale] > fips_number_key[activity_to_scale]:
                        log.info("Aggregating subset from " + activity_from_scale +
                                 " to " + activity_to_scale)
                        flow_subset = agg_by_geoscale(flow_subset, activity_from_scale,
                                                      activity_to_scale,
                                                      fba_default_grouping_fields, n)
                    # else, aggregate to geoscale want to use
                    elif fips_number_key[activity_from_scale] > fips_number_key[v['geoscale_to_use']]:
                        log.info("Aggregating subset from " + activity_from_scale +
                                 " to " + v['geoscale_to_use'])
                        flow_subset = agg_by_geoscale(flow_subset, activity_from_scale,
                                                      v['geoscale_to_use'],
                                                      fba_default_grouping_fields, n)
                    # else, if usgs is more aggregated than allocation table, filter relevant rows
                    else:
                        log.info("Subsetting " + activity_from_scale + " data")
                        flow_subset = filter_by_geoscale(flow_subset, activity_from_scale, n)

                    # Add sectors to df activity, depending on level of specified sector aggregation
                    log.info("Adding sectors to " + k + " for " + n)
                    flow_subset_wsec = add_sectors_to_flowbyactivity(
                        flow_subset,
                        sectorsourcename=method['target_sector_source'],
                        levelofSectoragg=attr['activity_sector_aggregation'])
                    flow_subset_list.append(flow_subset_wsec)
                # recombine the per-activity results into one dataframe
                flow_subset_wsec = pd.concat(flow_subset_list, sort=False).reset_index(drop=True)

                # clean up fba with sectors, if specified in yaml
                if v["clean_fba_w_sec_df_fxn"] != 'None':
                    log.info("Cleaning up " + k + " FlowByActivity with sectors")
                    flow_subset_wsec = getattr(sys.modules[__name__],
                                               v["clean_fba_w_sec_df_fxn"])(flow_subset_wsec, attr)

                # map df to elementary flows
                log.info("Mapping flows in " + k + ' to federal elementary flow list')
                flow_subset_wsec = map_elementary_flows(flow_subset_wsec, k)

                # if allocation method is "direct", then no need to create alloc ratios, else need to use allocation
                # dataframe to create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    log.info('Directly assigning ' + ', '.join(map(str, names)) + ' to sectors')
                    fbs = flow_subset_wsec.copy()

                else:
                    # determine appropriate allocation dataset
                    log.info("Loading allocation flowbyactivity " + attr['allocation_source'] +
                             " for year " + str(attr['allocation_source_year']))
                    fba_allocation = flowsa.getFlowByActivity(
                        flowclass=[attr['allocation_source_class']],
                        datasource=attr['allocation_source'],
                        years=[attr['allocation_source_year']]).reset_index(drop=True)

                    fba_allocation = clean_df(fba_allocation, flow_by_activity_fields,
                                              fba_fill_na_dict)

                    # subset based on yaml settings
                    if attr['allocation_flow'] != 'None':
                        fba_allocation = fba_allocation.loc[fba_allocation['FlowName'].isin(attr['allocation_flow'])]
                    if attr['allocation_compartment'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['Compartment'].isin(attr['allocation_compartment'])]
                    # cleanup the fba allocation df, if necessary
                    if 'clean_allocation_fba' in attr:
                        log.info("Cleaning " + attr['allocation_source'])
                        fba_allocation = getattr(sys.modules[__name__],
                                                 attr["clean_allocation_fba"])(fba_allocation)
                    # reset index
                    fba_allocation = fba_allocation.reset_index(drop=True)

                    # check if allocation data exists at specified geoscale to use
                    log.info("Checking if allocation data exists at the " +
                             attr['allocation_from_scale'] + " level")
                    check_if_data_exists_at_geoscale(fba_allocation, attr['allocation_from_scale'])

                    # aggregate geographically to the scale of the flowbyactivity source, if necessary
                    from_scale = attr['allocation_from_scale']
                    to_scale = v['geoscale_to_use']
                    # if allocation df is less aggregated than FBA df, aggregate allocation df to target scale
                    if fips_number_key[from_scale] > fips_number_key[to_scale]:
                        fba_allocation = agg_by_geoscale(fba_allocation, from_scale, to_scale,
                                                         fba_default_grouping_fields, names)
                    # else, if usgs is more aggregated than allocation table, use usgs as both to and from scale
                    else:
                        fba_allocation = filter_by_geoscale(fba_allocation, from_scale, names)

                    # assign sector to allocation dataset
                    log.info("Adding sectors to " + attr['allocation_source'])
                    fba_allocation = add_sectors_to_flowbyactivity(
                        fba_allocation,
                        sectorsourcename=method['target_sector_source'],
                        levelofSectoragg=attr['allocation_sector_aggregation'])
                    # subset fba datasets to only keep the sectors associated with activity subset
                    log.info("Subsetting " + attr['allocation_source'] + " for sectors in " + k)
                    fba_allocation_subset = get_fba_allocation_subset(fba_allocation, k, names)
                    # generalize activity field names to enable link to main fba source
                    log.info("Generalizing activity columns in subset of " +
                             attr['allocation_source'])
                    fba_allocation_subset = generalize_activity_field_names(fba_allocation_subset)
                    # drop columns
                    fba_allocation_subset = fba_allocation_subset.drop(columns=['Activity'])

                    # call on fxn to further disaggregate the fba allocation data, if exists
                    if 'allocation_disaggregation_fxn' in attr:
                        log.info("Futher disaggregating sectors in " + attr['allocation_source'])
                        fba_allocation_subset = getattr(
                            sys.modules[__name__],
                            attr["allocation_disaggregation_fxn"])(fba_allocation_subset, attr)

                    # if there is an allocation helper dataset, modify allocation df
                    if attr['allocation_helper'] == 'yes':
                        log.info("Using the specified allocation help for subset of " +
                                 attr['allocation_source'])
                        fba_allocation_subset = allocation_helper(fba_allocation_subset, method, attr)

                    # create flow allocation ratios
                    log.info("Creating allocation ratios for " + attr['allocation_source'])
                    flow_allocation = allocate_by_sector(fba_allocation_subset,
                                                         attr['allocation_method'])

                    # create list of sectors in the flow allocation df, drop any rows of data in the flow df that
                    # aren't in list
                    sector_list = flow_allocation['Sector'].unique().tolist()

                    # subset fba allocation table to the values in the activity list, based on overlapping sectors
                    flow_subset_wsec = flow_subset_wsec.loc[
                        (flow_subset_wsec[fbs_activity_fields[0]].isin(sector_list)) |
                        (flow_subset_wsec[fbs_activity_fields[1]].isin(sector_list))]

                    # check if fba and allocation dfs have the same LocationSystem
                    log.info("Checking if flowbyactivity and allocation dataframes use the same location systems")
                    check_if_location_systems_match(flow_subset_wsec, flow_allocation)

                    # merge fba df w/flow allocation dataset; the ratios are
                    # joined twice, first on the produced-by sector and then
                    # on the consumed-by sector, which yields the _x/_y
                    # suffixed columns used below
                    log.info("Merge " + k + " and subset of " + attr['allocation_source'])
                    fbs = flow_subset_wsec.merge(
                        flow_allocation[['Location', 'Sector', 'FlowAmountRatio']],
                        left_on=['Location', 'SectorProducedBy'],
                        right_on=['Location', 'Sector'], how='left')

                    fbs = fbs.merge(
                        flow_allocation[['Location', 'Sector', 'FlowAmountRatio']],
                        left_on=['Location', 'SectorConsumedBy'],
                        right_on=['Location', 'Sector'], how='left')

                    # merge the flowamount columns: prefer the produced-by
                    # ratio, fall back to the consumed-by ratio
                    fbs.loc[:, 'FlowAmountRatio'] = fbs['FlowAmountRatio_x'].fillna(fbs['FlowAmountRatio_y'])

                    # check if fba and alloc dfs have data for same geoscales - comment back in after address the 'todo'
                    # log.info("Checking if flowbyactivity and allocation dataframes have data at the same locations")
                    # check_if_data_exists_for_same_geoscales(fbs, k, attr['names'])

                    # drop rows where there is no allocation data
                    # NOTE(review): reset_index() without drop=True keeps the
                    # old index as a column - presumably removed later by
                    # clean_df; confirm
                    fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'], how='all').reset_index()

                    # calculate flow amounts for each sector
                    log.info("Calculating new flow amounts using flow ratios")
                    fbs.loc[:, 'FlowAmount'] = fbs['FlowAmount'] * fbs['FlowAmountRatio']

                    # drop columns
                    log.info("Cleaning up new flow by sector")
                    fbs = fbs.drop(columns=['Sector_x', 'FlowAmountRatio_x', 'Sector_y',
                                            'FlowAmountRatio_y', 'FlowAmountRatio',
                                            'ActivityProducedBy', 'ActivityConsumedBy'])

                # the steps below apply to both the direct and the
                # ratio-based result
                # drop rows where flowamount = 0 (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # clean df
                fbs = clean_df(fbs, flow_by_sector_fields, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                log.info("Aggregating flowbysector to " + method['target_geoscale'] + " level")
                # start from whichever of the two scales has the smaller
                # fips_number_key value
                if fips_number_key[v['geoscale_to_use']] < fips_number_key[attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']

                to_scale = method['target_geoscale']

                fbs = agg_by_geoscale(fbs, from_scale, to_scale, fbs_default_grouping_fields, names)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs = sector_aggregation(fbs, fbs_default_grouping_fields)
                # add missing naics5/6 when only one naics5/6 associated with a naics4
                fbs = sector_disaggregation(fbs)

                # test agg by sector
                # sector_agg_comparison = sector_flow_comparision(fbs)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])
                # add any non-NAICS sectors used with NAICS
                sector_list = add_non_naics_sectors(sector_list, method['target_sector_level'])

                # subset df, necessary because not all of the sectors are NAICS and can get duplicate rows
                # NOTE(review): every row selected by fbs_1 (both columns in
                # the list) also satisfies the fbs_2 condition (either column
                # in the list), so those rows appear twice after the concat -
                # confirm this is intended
                fbs_1 = fbs.loc[(fbs[fbs_activity_fields[0]].isin(sector_list)) &
                                (fbs[fbs_activity_fields[1]].isin(sector_list))].reset_index(drop=True)
                fbs_2 = fbs.loc[(fbs[fbs_activity_fields[0]].isin(sector_list)) |
                                (fbs[fbs_activity_fields[1]].isin(sector_list))].reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2], sort=False)

                # set source name
                fbs_sector_subset.loc[:, 'SectorSourceName'] = method['target_sector_source']

                log.info("Completed flowbysector for activity subset with flows " +
                         ', '.join(map(str, names)))
                fbss.append(fbs_sector_subset)
        else:
            # if the loaded flow dt is already in FBS format, append directly to list of FBS
            log.info("Append " + k + " to FBS list")
            fbss.append(flows)
    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbss, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # aggregate df as activities might have data for the same specified sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    # turn the string 'nan' back into None before the final clean
    fbss = fbss.replace({'nan': None})
    # add missing fields, ensure correct data type, reorder columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable', 'Context']).reset_index(drop=True)
    # save parquet file
    store_flowbysector(fbss, method_name)
def stewicombo_to_sector(inventory_dict, NAICS_level, geo_scale, compartments):
    """
    Returns emissions from stewicombo in fbs format
    :param inventory_dict: a dictionary of inventory types and years
        (e.g., {'NEI':'2017', 'TRI':'2017'})
    :param NAICS_level: desired NAICS aggregation level, using
        sector_level_key, should match target_sector_level
    :param geo_scale: desired geographic aggregation level
        ('national', 'state', 'county'), should match target_geoscale
    :param compartments: list of compartments to include
        (e.g., 'water', 'air', 'soil'), use None to include all compartments
    :return: dataframe of emissions summed by levelized NAICS, FlowName,
        Compartment and Location, with the flow_by_sector_fields added
    """
    # stewi packages are imported lazily so they are only required when
    # this function is actually called
    from stewi.globals import output_dir as stw_output_dir
    from stewi.globals import weighted_average
    import stewi
    import stewicombo
    import facilitymatcher
    from stewicombo.overlaphandler import remove_default_flow_overlaps
    from stewicombo.globals import addChemicalMatches

    NAICS_level_value = sector_level_key[NAICS_level]

    # run stewicombo to combine inventories, filter for LCI, remove overlap
    df = stewicombo.combineFullInventories(inventory_dict,
                                           filter_for_LCI=True,
                                           remove_overlap=True,
                                           compartments=compartments)
    df.drop(columns=['SRS_CAS', 'SRS_ID', 'FacilityIDs_Combined'],
            inplace=True)

    # load facility data from stewi output directory, keeping only the
    # facility IDs, and geographic information
    inventory_list = list(inventory_dict.keys())
    facility_frames = []
    for inventory_type, inventory_year in inventory_dict.items():
        # define inventory name as inventory type + inventory year
        # (e.g., NEI_2017)
        inventory_name = inventory_type + '_' + inventory_year
        facilities = pd.read_csv(
            stw_output_dir + 'facility/' + inventory_name + '.csv',
            usecols=['FacilityID', 'State', 'County'],
            dtype={'FacilityID': str})
        if facilities.duplicated(subset='FacilityID', keep=False).any():
            log.info('Duplicate facilities in ' + inventory_name +
                     ' - keeping first listed')
            facilities = facilities.drop_duplicates(subset='FacilityID',
                                                    keep='first')
        facility_frames.append(facilities)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    # An empty inventory_dict still yields an empty dataframe, as before.
    if facility_frames:
        facility_mapping = pd.concat(facility_frames)
    else:
        facility_mapping = pd.DataFrame()

    # Apply FIPS to facility locations
    facility_mapping = apply_county_FIPS(facility_mapping)

    # merge dataframes to assign facility information based on facility IDs
    df = pd.merge(df, facility_mapping, how='left', on='FacilityID')

    # Access NAICS From facility matcher and assign based on FRS_ID
    all_NAICS = facilitymatcher.get_FRS_NAICSInfo_for_facility_list(
        frs_id_list=None, inventories_of_interest_list=inventory_list)
    all_NAICS = all_NAICS.loc[all_NAICS['PRIMARY_INDICATOR'] == 'PRIMARY']
    # non-inplace drop: all_NAICS is a filtered slice at this point
    all_NAICS = all_NAICS.drop(columns=['PRIMARY_INDICATOR'])
    all_NAICS = naics_expansion(all_NAICS)
    if all_NAICS.duplicated(subset=['FRS_ID', 'Source'], keep=False).any():
        log.info('Duplicate primary NAICS reported - keeping first')
        all_NAICS = all_NAICS.drop_duplicates(subset=['FRS_ID', 'Source'],
                                              keep='first')
    df = pd.merge(df, all_NAICS, how='left', on=['FRS_ID', 'Source'])

    # add levelized NAICS code prior to aggregation
    df['NAICS_lvl'] = df['NAICS'].str[0:NAICS_level_value]

    # subtract emissions for air transportation from airports in NEI
    airport_NAICS = '4881'
    air_transportation_SCC = '2275020000'
    air_transportation_naics = '481111'
    if 'NEI' in inventory_list:
        log.info('Reassigning emissions from air transportation from airports')

        # obtain and prepare SCC dataset
        df_airplanes = stewi.getInventory('NEI', inventory_dict['NEI'],
                                          stewiformat='flowbySCC')
        # explicit copy so the column assignment below does not write
        # into a slice of the full inventory
        df_airplanes = df_airplanes[
            df_airplanes['SCC'] == air_transportation_SCC].copy()
        df_airplanes['Source'] = 'NEI'
        df_airplanes = addChemicalMatches(df_airplanes)
        df_airplanes = remove_default_flow_overlaps(df_airplanes, SCC=True)
        df_airplanes = df_airplanes.drop(columns=['SCC'])

        # attach the NAICS already assigned to each facility in df
        facility_mapping_air = df[['FacilityID', 'NAICS']].drop_duplicates(
            keep='first')
        df_airplanes = df_airplanes.merge(facility_mapping_air, how='left',
                                          on='FacilityID')

        df_airplanes['Year'] = inventory_dict['NEI']
        # keep only records located at facilities with an airport NAICS
        df_airplanes = df_airplanes[
            df_airplanes['NAICS'].str[0:len(airport_NAICS)] ==
            airport_NAICS].copy()

        # subtract airplane emissions from airport NAICS at individual
        # facilities
        # NOTE(review): assumes at most one airplane record per
        # (FacilityID, FlowName); duplicates would multiply rows in df
        # during the merge - confirm against the SCC data
        df_planeemissions = df_airplanes[
            ['FacilityID', 'FlowName', 'FlowAmount']].rename(
            columns={'FlowAmount': 'PlaneEmissions'})
        df = df.merge(df_planeemissions, how='left',
                      on=['FacilityID', 'FlowName'])
        # rows without a matching airplane record have nothing to subtract
        df['PlaneEmissions'] = df['PlaneEmissions'].fillna(value=0)
        df['FlowAmount'] = df['FlowAmount'] - df['PlaneEmissions']
        df.drop(columns=['PlaneEmissions'], inplace=True)

        # add airplane emissions under air transport NAICS
        df_airplanes.loc[:, 'NAICS_lvl'] = air_transportation_naics[
            0:NAICS_level_value]
        df = pd.concat([df, df_airplanes], ignore_index=True)

    # update location to appropriate geoscale prior to aggregating
    df.dropna(subset=['Location'], inplace=True)
    df['Location'] = df['Location'].astype(str)
    df = update_geoscale(df, geo_scale)

    # assign grouping variables based on desired geographic aggregation level
    grouping_vars = ['NAICS_lvl', 'FlowName', 'Compartment', 'Location']

    # aggregate by NAICS code, FlowName, compartment, and geographic level
    fbs = df.groupby(grouping_vars).agg({
        'FlowAmount': 'sum',
        'Year': 'first',
        'Unit': 'first'
    })

    # add reliability score (computed on the grouped index, so it must be
    # assigned before the index is reset)
    fbs['DataReliability'] = weighted_average(df, 'ReliabilityScore',
                                              'FlowAmount', grouping_vars)
    fbs.reset_index(inplace=True)

    # apply flow mapping
    fbs = map_elementary_flows(fbs, inventory_list)

    # rename columns to match flowbysector format
    fbs = fbs.rename(columns={"NAICS_lvl": "SectorProducedBy"})

    # add hardcoded data, depending on the source data, some of these
    # fields may need to change
    fbs['Class'] = 'Chemicals'
    fbs['SectorConsumedBy'] = 'None'
    fbs['SectorSourceName'] = 'NAICS_2012_Code'
    fbs['FlowType'] = 'ELEMENTARY_FLOW'

    # the year of the first listed inventory is passed to
    # assign_fips_location_system
    fbs = assign_fips_location_system(fbs, list(inventory_dict.values())[0])

    # add missing flow by sector fields
    fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)

    # sort dataframe and reset index
    fbs = fbs.sort_values(list(
        flow_by_sector_fields.keys())).reset_index(drop=True)

    return fbs