def collapse_fbs_sectors(fbs):
    """
    Collapse the SectorProducedBy and SectorConsumedBy columns into a
    single column named "Sector", based on identified rules for each
    FlowType
    :param fbs: df, a standard FlowBySector (format)
    :return: df, FBS with single Sector column
    """
    # ensure correct datatypes and order
    fbs = clean_df(fbs, flow_by_sector_fields, fbs_fill_na_dict)

    # collapse the FBS sector columns into one column based on FlowType
    fbs.loc[fbs["FlowType"] == 'TECHNOSPHERE_FLOW', 'Sector'] = \
        fbs["SectorConsumedBy"]
    fbs.loc[fbs["FlowType"] == 'WASTE_FLOW', 'Sector'] = \
        fbs["SectorProducedBy"]
    fbs.loc[(fbs["FlowType"] == 'WASTE_FLOW') &
            (fbs['SectorProducedBy'].isnull()), 'Sector'] = \
        fbs["SectorConsumedBy"]
    fbs.loc[(fbs["FlowType"] == 'ELEMENTARY_FLOW') &
            (fbs['SectorProducedBy'].isnull()), 'Sector'] = \
        fbs["SectorConsumedBy"]
    fbs.loc[(fbs["FlowType"] == 'ELEMENTARY_FLOW') &
            (fbs['SectorConsumedBy'].isnull()), 'Sector'] = \
        fbs["SectorProducedBy"]
    fbs.loc[(fbs["FlowType"] == 'ELEMENTARY_FLOW') &
            (fbs['SectorConsumedBy'].isin(['F010', 'F0100', 'F01000'])) &
            (fbs['SectorProducedBy'].isin(
                ['22', '221', '2213', '22131', '221310'])),
            'Sector'] = fbs["SectorConsumedBy"]

    # drop sector consumed/produced by columns
    fbs_collapsed = fbs.drop(columns=['SectorProducedBy',
                                      'SectorConsumedBy'])
    # aggregate
    fbs_collapsed = aggregator(fbs_collapsed,
                               fbs_collapsed_default_grouping_fields)
    # sort dataframe
    fbs_collapsed = clean_df(fbs_collapsed,
                             flow_by_sector_collapsed_fields,
                             fbs_collapsed_fill_na_dict)
    fbs_collapsed = fbs_collapsed.sort_values(
        ['Sector', 'Flowable', 'Context',
         'Location']).reset_index(drop=True)

    return fbs_collapsed

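# Illustrative sketch (not part of the module API): how the FlowType rules
# above pick a single Sector on a toy frame. The column names match the FBS
# format; the data values are made up for illustration.
def _example_sector_collapse_rules():
    import pandas as pd
    df = pd.DataFrame({
        'FlowType': ['TECHNOSPHERE_FLOW', 'WASTE_FLOW', 'ELEMENTARY_FLOW'],
        'SectorProducedBy': ['111110', '562000', None],
        'SectorConsumedBy': ['311110', None, '221310'],
    })
    # technosphere flows take the consuming sector, waste flows the
    # producing sector, and elementary flows fall back to whichever
    # sector column is populated
    df.loc[df['FlowType'] == 'TECHNOSPHERE_FLOW', 'Sector'] = \
        df['SectorConsumedBy']
    df.loc[df['FlowType'] == 'WASTE_FLOW', 'Sector'] = \
        df['SectorProducedBy']
    df.loc[(df['FlowType'] == 'ELEMENTARY_FLOW') &
           (df['SectorProducedBy'].isnull()), 'Sector'] = \
        df['SectorConsumedBy']
    return df  # Sector: ['311110', '562000', '221310']
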
def load_fba_w_standardized_units(datasource, year, **kwargs):
    """
    Standardize how a FBA is loaded for allocation purposes when
    generating a FBS. Important to immediately convert the df units to
    standardized units.
    :param datasource: string, FBA source name
    :param year: int, year of data
    :param kwargs: optional parameters include flowclass,
        geographic_level, download_FBA_if_missing,
        allocation_map_to_flow_list
    :return: fba df with standardized units
    """
    from flowsa.sectormapping import map_fbs_flows

    # determine if any additional parameters are required to load a
    # Flow-By-Activity; add parameters to the dictionary if they exist
    # in the method yaml
    fba_dict = {}
    if 'flowclass' in kwargs:
        fba_dict['flowclass'] = kwargs['flowclass']
    if 'geographic_level' in kwargs:
        fba_dict['geographic_level'] = kwargs['geographic_level']
    if 'download_FBA_if_missing' in kwargs:
        fba_dict['download_FBA_if_missing'] = \
            kwargs['download_FBA_if_missing']
    # load the allocation FBA
    fba = flowsa.getFlowByActivity(
        datasource, year, **fba_dict).reset_index(drop=True)
    # convert to standardized units either by mapping to federal
    # flow list/material flow list or by using function. Mapping will add
    # context and flowable columns
    if kwargs.get('allocation_map_to_flow_list'):
        # ensure df loaded correctly/has correct dtypes
        fba = clean_df(fba, flow_by_activity_fields, fba_fill_na_dict,
                       drop_description=False)
        fba, mapping_files = map_fbs_flows(
            fba, datasource, kwargs, keep_fba_columns=True,
            keep_unmapped_rows=True)
    else:
        # ensure df loaded correctly/has correct dtypes
        fba = clean_df(fba, flow_by_activity_fields, fba_fill_na_dict)
        fba = standardize_units(fba)

    return fba

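# Illustrative usage sketch: optional keyword arguments are passed through
# to flowsa.getFlowByActivity. The datasource and flowclass below mirror
# the call made in disaggregate_cropland later in this module; the year is
# an assumption for illustration.
def _example_load_fba():
    fba = load_fba_w_standardized_units(
        datasource="USDA_CoA_Cropland_NAICS", year=2017,
        flowclass='Land')
    return fba
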
def process_data_frame(*, df, source, year, config):
    """
    Process the given dataframe: clean, convert data, and write the
    final parquet. This method was written to move code into a shared
    method, which was necessary to support the processing of a list of
    dataframes instead of a single dataframe.
    :param df: df, FBA format
    :param source: str, source name
    :param year: str, year
    :param config: dict, items in method yaml
    :return: None, writes standardized FBA parquet and metadata to the
        local directory
    """
    # log that data was retrieved
    log.info("Retrieved data for %s %s", source, year)
    # add any missing columns of data and cast to appropriate data type
    log.info("Add any missing columns and check field datatypes")
    flow_df = clean_df(df, flow_by_activity_fields, fba_fill_na_dict,
                       drop_description=False)
    # sort df and reset index
    flow_df = flow_df.sort_values(
        ['Class', 'Location', 'ActivityProducedBy', 'ActivityConsumedBy',
         'FlowName', 'Compartment']).reset_index(drop=True)
    # save as parquet file
    name_data = set_fba_name(source, year)
    meta = set_fb_meta(name_data, "FlowByActivity")
    write_df_to_file(flow_df, paths, meta)
    write_metadata(source, config, meta, "FlowByActivity", year=year)
    log.info("FBA generated and saved for %s", name_data)
    # rename the log file saved to local directory
    rename_log_file(name_data, meta)

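# Illustrative sketch: because of the bare * in the signature,
# process_data_frame only accepts keyword arguments. The source name and
# year below are hypothetical placeholders.
def _example_process_data_frame(df, config):
    process_data_frame(df=df, source="Example_FBA_Source", year="2017",
                       config=config)
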
def subset_df_by_geoscale(df, activity_from_scale, activity_to_scale):
    """
    Subset a df by geoscale or aggregate to create data specified in
    method yaml
    :param df: df, FBA format
    :param activity_from_scale: str, identified geoscale by which to
        subset or aggregate from ('national', 'state', 'county')
    :param activity_to_scale: str, identified geoscale by which to
        subset or aggregate to ('national', 'state', 'county')
    :return: df, FBA, subset or aggregated to a single geoscale for
        all rows
    """
    # method of subset dependent on LocationSystem
    if df['LocationSystem'].str.contains('FIPS').any():
        df = df[df['LocationSystem'].str.contains(
            'FIPS')].reset_index(drop=True)
        # determine 'activity_from_scale' for use in df geoscale
        # subset, by activity
        modified_from_scale = return_activity_from_scale(
            df, activity_from_scale)
        # add 'activity_from_scale' column to df
        df2 = pd.merge(df, modified_from_scale)

        # list of unique 'from' geoscales
        unique_geoscales = modified_from_scale[
            'activity_from_scale'].drop_duplicates().values.tolist()
        if len(unique_geoscales) > 1:
            log.info('Dataframe has a mix of geographic levels: %s',
                     ', '.join(unique_geoscales))

        # to scale
        if fips_number_key[activity_from_scale] > \
                fips_number_key[activity_to_scale]:
            to_scale = activity_to_scale
        else:
            to_scale = activity_from_scale

        df_subset_list = []
        # subset df based on activity 'from' scale
        for i in unique_geoscales:
            df3 = df2[df2['activity_from_scale'] == i]
            # if desired geoscale doesn't exist, aggregate existing data;
            # if df is less aggregated than allocation df, aggregate
            # fba activity to allocation geoscale
            if fips_number_key[i] > fips_number_key[to_scale]:
                log.info("Aggregating subset from %s to %s", i, to_scale)
                df_sub = agg_by_geoscale(
                    df3, i, to_scale, fba_mapped_default_grouping_fields)
            # else filter relevant rows
            else:
                log.info("Subsetting %s data", i)
                df_sub = filter_by_geoscale(df3, i)
            df_subset_list.append(df_sub)
        df_subset = pd.concat(df_subset_list, ignore_index=True)

        # only keep cols associated with FBA mapped
        df_subset = clean_df(df_subset, flow_by_activity_mapped_fields,
                             fba_fill_na_dict, drop_description=False)
    # right now, the only other location system is for
    # Statistics Canada data
    else:
        df_subset = df.copy()

    return df_subset

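# Illustrative sketch of the scale comparison above. fips_number_key maps
# geoscale names to the number of meaningful FIPS digits; the values below
# are an assumption for illustration, not the module's definition.
def _example_to_scale():
    fips_number_key = {'national': 0, 'state': 2, 'county': 5}
    activity_from_scale, activity_to_scale = 'county', 'state'
    # data can only be aggregated up, never disaggregated down, so the
    # coarser (smaller-key) of the two scales wins
    if fips_number_key[activity_from_scale] > \
            fips_number_key[activity_to_scale]:
        to_scale = activity_to_scale
    else:
        to_scale = activity_from_scale
    return to_scale  # 'state'
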
def disaggregate_cropland(fba_w_sector, attr, method, year, sector_column):
    """
    In the event there are 4 (or 5) digit naics for cropland at the
    county level, use state level harvested cropland to create ratios
    :param fba_w_sector: df, CoA cropland data, FBA format with sector
        columns
    :param attr: dictionary, attribute data from method yaml for
        activity set
    :param method: dictionary, FBS method yaml
    :param year: str, year of data
    :param sector_column: str, the sector column on which to make df
        modifications (SectorProducedBy or SectorConsumedBy)
    :return: df, CoA cropland data disaggregated
    """
    # tmp drop NoneTypes
    fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector)

    # drop pastureland data
    crop = fba_w_sector.loc[fba_w_sector[sector_column].apply(
        lambda x: x[0:3]) != '112'].reset_index(drop=True)
    # drop sectors < 4 digits
    crop = crop[crop[sector_column].apply(
        lambda x: len(x) > 3)].reset_index(drop=True)
    # create tmp location
    crop = crop.assign(
        Location_tmp=crop['Location'].apply(lambda x: x[0:2]))

    # load the relevant state level harvested cropland by naics
    naics = load_fba_w_standardized_units(
        datasource="USDA_CoA_Cropland_NAICS", year=year, flowclass='Land')
    # subset the harvested cropland by naics
    naics = naics[naics['FlowName'] ==
                  'AG LAND, CROPLAND, HARVESTED'].reset_index(drop=True)
    # drop the activities that include '&'
    naics = naics[~naics['ActivityConsumedBy'].str.contains(
        '&')].reset_index(drop=True)
    # add sectors
    naics = add_sectors_to_flowbyactivity(
        naics, sectorsourcename=method['target_sector_source'])
    # estimate suppressed data by equally allocating parent to
    # child naics
    naics = estimate_suppressed_data(naics, 'SectorConsumedBy', 3,
                                     'USDA_CoA_Cropland_NAICS')
    # add missing fbs fields
    naics = clean_df(naics, flow_by_sector_fields, fbs_fill_na_dict)

    # aggregate sectors to create any missing naics levels
    group_cols = fbs_default_grouping_fields
    naics2 = sector_aggregation(naics, group_cols)
    # add missing naics5/6 when only one naics5/6 associated with
    # a naics4
    naics3 = sector_disaggregation(naics2)
    # drop rows where FlowAmount is 0
    naics3 = naics3.loc[naics3['FlowAmount'] != 0]
    # create ratios
    naics4 = sector_ratios(naics3, sector_column)
    # create temporary sector column to match the two dfs on
    naics4 = naics4.assign(
        Location_tmp=naics4['Location'].apply(lambda x: x[0:2]))
    # tmp drop Nonetypes
    naics4 = replace_NoneType_with_empty_cells(naics4)

    # check units in prep for merge
    compare_df_units(crop, naics4)
    # loop through naics lengths to determine the naics 4 and 5 digit
    # sectors to disaggregate
    for i in range(4, 6):
        # subset df to sectors with length = i and length = i + 1
        crop_subset = crop.loc[crop[sector_column].apply(
            lambda x: i + 1 >= len(x) >= i)]
        crop_subset = crop_subset.assign(
            Sector_tmp=crop_subset[sector_column].apply(lambda x: x[0:i]))
        # if duplicates drop all rows
        df = crop_subset.drop_duplicates(
            subset=['Location', 'Sector_tmp'],
            keep=False).reset_index(drop=True)
        # drop sector temp column
        df = df.drop(columns=["Sector_tmp"])
        # subset df to keep the sectors of length i
        df_subset = df.loc[df[sector_column].apply(lambda x: len(x) == i)]
        # subset the naics df where naics length is i + 1
        naics_subset = naics4.loc[naics4[sector_column].apply(
            lambda x: len(x) == i + 1)].reset_index(drop=True)
        naics_subset = naics_subset.assign(
            Sector_tmp=naics_subset[sector_column].apply(lambda x: x[0:i]))
        # merge the two dfs based on locations
        df_subset = pd.merge(
            df_subset,
            naics_subset[[sector_column, 'FlowAmountRatio', 'Sector_tmp',
                          'Location_tmp']],
            how='left',
            left_on=[sector_column, 'Location_tmp'],
            right_on=['Sector_tmp', 'Location_tmp'])
        # create flow amounts for the new NAICS based on the flow ratio
        df_subset.loc[:, 'FlowAmount'] = \
            df_subset['FlowAmount'] * df_subset['FlowAmountRatio']
        # drop rows of 0 and na
        df_subset = df_subset[df_subset['FlowAmount'] != 0]
        df_subset = df_subset[
            ~df_subset['FlowAmount'].isna()].reset_index(drop=True)
        # drop columns
        df_subset = df_subset.drop(
            columns=[sector_column + '_x', 'FlowAmountRatio',
                     'Sector_tmp'])
        # rename columns
        df_subset = df_subset.rename(
            columns={sector_column + '_y': sector_column})
        # tmp drop Nonetypes
        df_subset = replace_NoneType_with_empty_cells(df_subset)
        # add new rows of data to crop df
        crop = pd.concat([crop, df_subset],
                         sort=True).reset_index(drop=True)

    # clean up df
    crop = crop.drop(columns=['Location_tmp'])
    # equally allocate any further missing naics
    crop = allocate_dropped_sector_data(crop, 'NAICS_6')

    # pasture data
    pasture = fba_w_sector.loc[fba_w_sector[sector_column].apply(
        lambda x: x[0:3]) == '112'].reset_index(drop=True)
    # concat crop and pasture
    fba_w_sector = pd.concat([pasture, crop],
                             sort=True).reset_index(drop=True)

    # fill empty cells with NoneType
    fba_w_sector = replace_strings_with_NoneType(fba_w_sector)

    return fba_w_sector

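# Illustrative sketch of the ratio logic used above (not the module's
# sector_ratios implementation, just the idea): within each state, a child
# NAICS's share of the state total becomes the FlowAmountRatio applied to
# the county-level parent amount. The values are made up.
def _example_flow_amount_ratio():
    import pandas as pd
    naics = pd.DataFrame({
        'Location_tmp': ['06', '06'],
        'Sector': ['11111', '11112'],  # children of NAICS 1111
        'FlowAmount': [30.0, 70.0],
    })
    state_total = naics.groupby(
        'Location_tmp')['FlowAmount'].transform('sum')
    naics['FlowAmountRatio'] = naics['FlowAmount'] / state_total
    return naics  # ratios 0.3 and 0.7, applied to the parent FlowAmount
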
def main(**kwargs):
    """
    Creates a flowbysector dataset
    :param kwargs: dictionary of arguments: 'method', the name of the
        flowbysector method yaml, and optionally
        'download_FBAs_if_missing', bool
    :return: parquet, FBS saved to local folder
    """
    if len(kwargs) == 0:
        kwargs = parse_args()

    method_name = kwargs['method']
    download_FBA_if_missing = kwargs.get('download_FBAs_if_missing')
    # assign arguments
    vLog.info("Initiating flowbysector creation for %s", method_name)
    # call on method
    method = load_yaml_dict(method_name, flowbytype='FBS')
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # Create empty list for storing fbs files
    fbs_list = []
    for k, v in fb.items():
        # pull fba data for allocation
        flows = load_source_dataframe(k, v, download_FBA_if_missing)

        if v['data_format'] == 'FBA':
            # ensure correct datatypes and that all fields exist
            flows = clean_df(flows, flow_by_activity_fields,
                             fba_fill_na_dict, drop_description=False)

            # clean up fba before mapping, if specified in yaml
            if "clean_fba_before_mapping_df_fxn" in v:
                vLog.info("Cleaning up %s FlowByActivity", k)
                flows = dynamically_import_fxn(
                    k, v["clean_fba_before_mapping_df_fxn"])(flows)

            # map flows to federal flow list or material flow list
            flows_mapped, mapping_files = \
                map_fbs_flows(flows, k, v, keep_fba_columns=True)

            # clean up fba, if specified in yaml
            if "clean_fba_df_fxn" in v:
                vLog.info("Cleaning up %s FlowByActivity", k)
                flows_mapped = dynamically_import_fxn(
                    k, v["clean_fba_df_fxn"])(flows_mapped)

            # if activity_sets are specified in a file, call them here
            if 'activity_set_file' in v:
                aset_names = pd.read_csv(
                    flowbysectoractivitysetspath + v['activity_set_file'],
                    dtype=str)
            else:
                aset_names = None

            # master list of activity names read in from data source
            ml_act = []
            # create dictionary of allocation datasets for
            # different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities
                if 'activity_set_file' in v:
                    names = aset_names[
                        aset_names['activity_set'] == aset]['name']
                else:
                    names = attr['names']

                # to avoid double counting data from the same source, in
                # the event there are values in both the APB and ACB
                # columns, if an activity has already been read in and
                # allocated, remove that activity from the mapped flows
                # regardless of what activity set the data was read in
                flows_mapped = flows_mapped[~(
                    (flows_mapped[fba_activity_fields[0]].isin(ml_act)) |
                    (flows_mapped[fba_activity_fields[1]].isin(ml_act))
                )].reset_index(drop=True)
                ml_act.extend(names)

                vLog.info("Preparing to handle %s in %s", aset, k)
                # subset fba data by activity
                flows_subset = flows_mapped[
                    (flows_mapped[fba_activity_fields[0]].isin(names)) |
                    (flows_mapped[fba_activity_fields[1]].isin(names)
                     )].reset_index(drop=True)

                # subset by flowname if exists
                if 'source_flows' in attr:
                    flows_subset = flows_subset[
                        flows_subset['FlowName'].isin(
                            attr['source_flows'])]
                if len(flows_subset) == 0:
                    log.warning(f"no data found for flows in {aset}")
                    continue
                if len(flows_subset[flows_subset['FlowAmount'] != 0]) == 0:
                    log.warning(f"all flow data for {aset} is 0")
                    continue
                # if activities are sector-like, check sectors are valid
                if check_activities_sector_like(k):
                    flows_subset2 = \
                        replace_naics_w_naics_from_another_year(
                            flows_subset, method['target_sector_source'])
                    # check impact on df FlowAmounts
                    vLog.info('Calculate FlowAmount difference caused by '
                              'replacing NAICS Codes with %s, saving '
                              'difference in Validation log',
                              method['target_sector_source'])
                    calculate_flowamount_diff_between_dfs(
                        flows_subset, flows_subset2)
                else:
                    flows_subset2 = flows_subset.copy()

                # extract relevant geoscale data or aggregate
                # existing data
                flows_subset_geo = subset_df_by_geoscale(
                    flows_subset2, v['geoscale_to_use'],
                    attr['allocation_from_scale'])
                # if loading data at a subnational geoscale, check for
                # data loss
                if attr['allocation_from_scale'] != 'national':
                    compare_geographic_totals(
                        flows_subset_geo, flows_mapped, k, attr, aset,
                        names)

                # Add sectors to df activity, depending on level
                # of specified sector aggregation
                log.info("Adding sectors to %s", k)
                flows_subset_wsec = add_sectors_to_flowbyactivity(
                    flows_subset_geo,
                    sectorsourcename=method['target_sector_source'],
                    allocationmethod=attr['allocation_method'])
                # clean up fba with sectors, if specified in yaml
                if "clean_fba_w_sec_df_fxn" in v:
                    vLog.info("Cleaning up %s FlowByActivity with sectors",
                              k)
                    flows_subset_wsec = dynamically_import_fxn(
                        k, v["clean_fba_w_sec_df_fxn"])(
                        flows_subset_wsec, attr=attr, method=method)

                # rename SourceName to MetaSources and drop columns
                flows_mapped_wsec = flows_subset_wsec.\
                    rename(columns={'SourceName': 'MetaSources'}).\
                    drop(columns=['FlowName', 'Compartment'])

                # if allocation method is "direct", then no need
                # to create alloc ratios, else need to use allocation
                # dataframe to create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    fbs = direct_allocation_method(
                        flows_mapped_wsec, k, names, method)
                # if allocation method for an activity set requires a
                # specific function due to the complicated nature of the
                # allocation, call on function here
                elif attr['allocation_method'] == 'allocation_function':
                    fbs = function_allocation_method(
                        flows_mapped_wsec, k, names, attr, fbs_list)
                else:
                    fbs = dataset_allocation_method(
                        flows_mapped_wsec, attr, names, method, k, v,
                        aset, aset_names, download_FBA_if_missing)

                # drop rows where flowamount = 0
                # (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # define grouping columns dependent on sectors
                # being activity-like or not
                if check_activities_sector_like(k) is False:
                    groupingcols = fbs_grouping_fields_w_activities
                    groupingdict = flow_by_sector_fields_w_activity
                else:
                    groupingcols = fbs_default_grouping_fields
                    groupingdict = flow_by_sector_fields

                # clean df
                fbs = clean_df(fbs, groupingdict, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                log.info("Aggregating flowbysector to %s level",
                         method['target_geoscale'])
                # determine from scale
                if fips_number_key[v['geoscale_to_use']] < \
                        fips_number_key[attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']

                fbs_geo_agg = agg_by_geoscale(
                    fbs, from_scale, method['target_geoscale'],
                    groupingcols)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs_sec_agg = sector_aggregation(fbs_geo_agg, groupingcols)
                # add missing naics5/6 when only one naics5/6
                # associated with a naics4
                fbs_agg = sector_disaggregation(fbs_sec_agg)

                # check if any sector information is lost before reaching
                # the target sector length; if so, allocate values equally
                # to disaggregated sectors
                vLog.info('Searching for and allocating FlowAmounts for '
                          'any parent NAICS that were dropped in the '
                          'subset to %s child NAICS',
                          method['target_sector_level'])
                fbs_agg_2 = equally_allocate_parent_to_child_naics(
                    fbs_agg, method['target_sector_level'])

                # compare flowbysector with flowbyactivity
                compare_activity_to_sector_flowamounts(
                    flows_mapped_wsec, fbs_agg_2, aset, k, method)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(
                    method['target_sector_level'])

                # subset df, necessary because not all of the sectors are
                # NAICS and can get duplicate rows
                fbs_1 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list))
                    & (fbs_agg_2[fbs_activity_fields[1]].isin(
                        sector_list))].reset_index(drop=True)
                fbs_2 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list))
                    & (fbs_agg_2[fbs_activity_fields[1]].isnull())].\
                    reset_index(drop=True)
                fbs_3 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isnull())
                    & (fbs_agg_2[fbs_activity_fields[1]].isin(
                        sector_list))].reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2, fbs_3])

                # drop activity columns
                fbs_sector_subset = fbs_sector_subset.drop(
                    ['ActivityProducedBy', 'ActivityConsumedBy'],
                    axis=1, errors='ignore')

                # save comparison of FBA total to FBS total for an
                # activity set
                compare_fba_geo_subset_and_fbs_output_totals(
                    flows_subset_geo, fbs_sector_subset, aset, k, v, attr,
                    method)

                log.info("Completed flowbysector for %s", aset)
                fbs_list.append(fbs_sector_subset)
        else:
            if 'clean_fbs_df_fxn' in v:
                flows = dynamically_import_fxn(
                    v["clean_fbs_df_fxn_source"],
                    v["clean_fbs_df_fxn"])(flows)
            flows = update_geoscale(flows, method['target_geoscale'])
            # if the loaded flow df is already in FBS format,
            # append directly to list of FBS
            log.info("Append %s to FBS list", k)
            # ensure correct field datatypes and add any missing fields
            flows = clean_df(flows, flow_by_sector_fields,
                             fbs_fill_na_dict)
            fbs_list.append(flows)

    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbs_list, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # add missing fields, ensure correct data type,
    # add missing columns, reorder columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    # prior to aggregating, replace MetaSources string with all sources
    # that share context/flowable/sector values
    fbss = harmonize_FBS_columns(fbss)
    # aggregate df as activities might have data for
    # the same specified sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    # ensure correct data types/order of columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable',
         'Context']).reset_index(drop=True)
    # check for negative flow amounts
    check_for_negative_flowamounts(fbss)
    # tmp reset data quality scores
    fbss = reset_fbs_dq_scores(fbss)
    # save parquet file
    meta = set_fb_meta(method_name, "FlowBySector")
    write_df_to_file(fbss, paths, meta)
    write_metadata(method_name, method, meta, "FlowBySector")
    # rename the log file saved to local directory
    rename_log_file(method_name, meta)
    log.info('See the Validation log for detailed assessment of '
             'model results in %s', logoutputpath)

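# Illustrative usage sketch: main() can be invoked programmatically with
# the same keyword arguments parse_args() would supply from the command
# line. The method name below is assumed to be a valid FBS method yaml
# name; substitute one available in your installation.
def _example_run_fbs():
    main(method='Water_national_2015_m1', download_FBAs_if_missing=True)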