def map_fbs_flows(fbs, from_fba_source, v, **kwargs):
    """
    Identifies the mapping file and applies mapping to fbs flows
    :param fbs: flow-by-sector dataframe
    :param from_fba_source: str Source name of fba list to look for mappings
    :param v: dictionary, The datasource parameters
    :param kwargs: includes keep_unmapped_columns and keep_fba_columns
    :return fbs_mapped: df, with flows mapped using federal elementary
        flow list or material flow list
    :return mapping_files: str, name of mapping file
    """
    if 'mfl_mapping' in v:
        # material flow list mapping specified in the method yaml
        log.info("Mapping flows in %s to material flow list", from_fba_source)
        mapping_files = v['mfl_mapping']
        flow_type = 'WASTE_FLOW'
        ignore_source_name = True
    else:
        log.info("Mapping flows in %s to federal elementary flow list",
                 from_fba_source)
        flow_type = 'ELEMENTARY_FLOW'
        if 'fedefl_mapping' in v:
            # explicit fedefl mapping named in the method yaml
            mapping_files = v['fedefl_mapping']
            ignore_source_name = True
        else:
            # fall back to the source name itself as the mapping key
            mapping_files = from_fba_source
            ignore_source_name = False

    fbs_mapped = map_flows(fbs, mapping_files, flow_type,
                           ignore_source_name, **kwargs)

    return fbs_mapped, mapping_files
def harmonize_FBS_columns(df):
    """
    For FBS use in USEEIOR, harmonize the values in the columns
    - LocationSystem: drop the year, so just 'FIPS'
    - MeasureofSpread: tmp set to NoneType as values currently misleading
    - Spread: tmp set to 0 as values currently misleading
    - DistributionType: tmp set to NoneType as values currently misleading
    - MetaSources: Combine strings for rows where
      class/context/flowtype/flowable/etc. are equal
    :param df: FBS dataframe with mixed values/strings in columns
    :return: FBS df with harmonized values/strings in columns
    """
    # harmonize LocationSystem column
    log.info('Drop year in LocationSystem')
    if df['LocationSystem'].str.contains('FIPS').all():
        df = df.assign(LocationSystem='FIPS')
    # harmonize MeasureofSpread
    log.info('Reset MeasureofSpread to NoneType')
    df = df.assign(MeasureofSpread=None)
    # reset spread, as current values are misleading
    log.info('Reset Spread to 0')
    df = df.assign(Spread=0)
    # harmonize Distributiontype
    log.info('Reset DistributionType to NoneType')
    df = df.assign(DistributionType=None)

    # harmonize metasources
    log.info('Harmonize MetaSources')
    df = replace_NoneType_with_empty_cells(df)

    # subset all string cols of the df and drop duplicates
    string_cols = [
        'Flowable', 'Class', 'SectorProducedBy', 'SectorConsumedBy',
        'SectorSourceName', 'Context', 'Location', 'LocationSystem', 'Unit',
        'FlowType', 'Year', 'MeasureofSpread', 'MetaSources'
    ]
    df_sub = df[string_cols].drop_duplicates().reset_index(drop=True)
    # sort df
    df_sub = df_sub.sort_values(
        ['MetaSources', 'SectorProducedBy',
         'SectorConsumedBy']).reset_index(drop=True)

    # group cols excluding MetaSources
    # BUGFIX: the original used `e not in 'MetaSources'`, which is a
    # substring test on the string 'MetaSources' rather than list
    # membership; it only behaved correctly because no other column name
    # happens to be a substring of 'MetaSources'. Use an explicit
    # comparison so a future column name cannot be silently dropped.
    group_no_meta = [e for e in string_cols if e != 'MetaSources']

    # combine/sum columns that share the same data other than Metasources,
    # combining MetaSources string in process
    df_sub = df_sub.groupby(group_no_meta)['MetaSources'].apply(
        ', '.join).reset_index()
    # drop the MetaSources col in original df and replace with the
    # MetaSources col in df_sub
    df = df.drop(columns='MetaSources')
    harmonized_df = df.merge(df_sub, how='left')
    harmonized_df = replace_strings_with_NoneType(harmonized_df)

    return harmonized_df
def writeFlowBySectorBibliography(methodname): """ Generate bibliography for FlowBySectorMethod in local directory :param methodname: string, FBS methodname for which to create .bib file :return: .bib file save to local directory """ # Generate a single .bib file for a list of Flow-By-Sector method names # and save file to local directory log.info('Write bibliography to %s%s.bib', biboutputpath, methodname) generate_fbs_bibliography(methodname)
def reassign_airplane_emissions(df, year, NAICS_level_value):
    """
    Reassigns emissions from airplanes to NAICS associated with air
    transportation instead of the NAICS assigned to airports
    :param df: a dataframe of emissions and mapped faciliites from
        stewicombo
    :param year: year as str
    :param NAICS_level_value: desired NAICS aggregation level, using
        sector_level_key, should match target_sector_level
    :return: df
    """
    import stewi
    from stewicombo.overlaphandler import remove_default_flow_overlaps
    from stewicombo.globals import addChemicalMatches

    # subtract emissions for air transportation from airports in NEI
    airport_NAICS = '4881'
    air_transportation_SCC = '2275020000'
    air_transportation_naics = '481111'
    log.info('Reassigning emissions from air transportation from airports')

    # obtain and prepare SCC dataset
    df_airplanes = stewi.getInventory('NEI', year,
                                      stewiformat='flowbyprocess')
    # keep only the aircraft SCC process records
    df_airplanes = df_airplanes[df_airplanes['Process'] ==
                                air_transportation_SCC]
    df_airplanes['Source'] = 'NEI'
    df_airplanes = addChemicalMatches(df_airplanes)
    df_airplanes = remove_default_flow_overlaps(df_airplanes, SCC=True)
    df_airplanes.drop(columns=['Process'], inplace=True)

    # map facility IDs to their assigned NAICS so airplane rows can be
    # limited to airport facilities
    facility_mapping_air = df[['FacilityID', 'NAICS']]
    # NOTE(review): inplace drop_duplicates on a slice of df may raise a
    # pandas SettingWithCopyWarning -- confirm a copy is intended here
    facility_mapping_air.drop_duplicates(keep='first', inplace=True)
    df_airplanes = df_airplanes.merge(facility_mapping_air, how='left',
                                      on='FacilityID')

    df_airplanes['Year'] = year
    # restrict to facilities whose NAICS prefix marks them as airports
    df_airplanes = df_airplanes[
        df_airplanes['NAICS'].str[0:len(airport_NAICS)] == airport_NAICS]

    # subtract airplane emissions from airport NAICS at individual facilities
    df_planeemissions = df_airplanes[['FacilityID', 'FlowName', 'FlowAmount']]
    # NOTE(review): inplace rename on a slice -- same SettingWithCopy
    # concern as above
    df_planeemissions.rename(columns={'FlowAmount': 'PlaneEmissions'},
                             inplace=True)
    df = df.merge(df_planeemissions, how='left',
                  on=['FacilityID', 'FlowName'])
    # facilities with no airplane rows get 0 subtracted
    df[['PlaneEmissions']] = df[['PlaneEmissions']].fillna(value=0)
    df['FlowAmount'] = df['FlowAmount'] - df['PlaneEmissions']
    df.drop(columns=['PlaneEmissions'], inplace=True)

    # add airplane emissions under air transport NAICS
    df_airplanes.loc[:, 'NAICS_lvl'] = \
        air_transportation_naics[0:NAICS_level_value]
    df = pd.concat([df, df_airplanes], ignore_index=True)

    return df
def generate_list_of_sources_in_fbs_method(methodname):
    """
    Determine what FlowByActivities are used to generate a FlowBySector
    :param methodname: string, FlowBySector method
    :return: list, pairs of FlowByActivity source names and years
    """
    sources = []
    # load the fbs method yaml
    fbs_yaml = load_yaml_dict(methodname, flowbytype='FBS')
    # create list of data and allocation data sets
    fbs = fbs_yaml['source_names']
    for fbs_k, fbs_v in fbs.items():
        try:
            sources.append([fbs_k, fbs_v['year']])
        except KeyError:
            # sources without a 'year' key are skipped entirely,
            # including their activity sets (note the continue)
            log.info(
                'Could not append %s to datasource '
                'list because missing year', fbs_k)
            continue
        activities = fbs_v['activity_sets']
        for aset, attr in activities.items():
            # yaml uses the literal string 'None' (not null) to signal
            # no allocation source
            if attr['allocation_source'] != 'None':
                sources.append([
                    attr['allocation_source'],
                    attr['allocation_source_year']
                ])
            if 'helper_source' in attr:
                sources.append(
                    [attr['helper_source'], attr['helper_source_year']])
            if 'literature_sources' in attr:
                for source, date in attr['literature_sources'].items():
                    sources.append([source, date])
    # load any additional fbas that are called in a fbs method within fxns
    try:
        fbas = load_fbs_methods_additional_fbas_config()[methodname]
        # nested config: source -> activity sets -> function -> fba -> year
        for s, acts_info in fbas.items():
            for acts, fxn_info in acts_info.items():
                for fxn, fba_info in fxn_info.items():
                    for fba, y in fba_info.items():
                        fxn_config = \
                            load_functions_loading_fbas_config()[fxn][fba]
                        sources.append([fxn_config['source'], y])
    except KeyError:
        # if no additional fbas than pass
        log.info(
            f'There are no additional Flow-By-Activities '
            'used in generating %s', methodname)
        pass
    return sources
def compare_df_units(df1_load, df2_load):
    """
    Determine what units are in each df prior to merge
    :param df1_load:
    :param df2_load:
    :return:
    """
    units_left = df1_load['Unit'].drop_duplicates().tolist()
    units_right = df2_load['Unit'].drop_duplicates().tolist()

    # symmetric difference: units appearing in exactly one of the two dfs
    mismatched = list(set(units_left) ^ set(units_right))

    # warn (via info log) when the two dfs do not share the same unit set
    if mismatched:
        log.info('Merging df with %s and df with %s units',
                 units_left, units_right)
def usgs_myb_year(years, current_year_str):
    """
    Sets the column for the string based on the year. Checks that the year
    you picked is in the last file.
    :param years: string, with hyphen (e.g. '2015-2019')
    :param current_year_str: string, year of interest
    :return: string, year column name (e.g. 'year_3'), or None when the
        requested year falls outside the covered span
    """
    bounds = years.split("-")
    start, end = int(bounds[0]), int(bounds[1])
    target = int(current_year_str)

    # out-of-range request: log and return nothing
    if not start <= target <= end:
        log.info("Your year is out of scope. Pick a year between %s and %s",
                 start, end)
        return None

    # columns are 1-indexed from the first year covered by the file
    return "year_" + str(target - start + 1)
def function_allocation_method(flow_subset_mapped, k, names, attr, fbs_list):
    """
    Allocate df activities to sectors using a function identified
    in the FBS method yaml
    :param flow_subset_mapped: df, FBA with flows converted using
        fedelemflowlist
    :param k: str, source name
    :param names: list, activity names in activity set
    :param attr: dictionary, attribute data from method yaml for activity set
    :param fbs_list: list, fbs dfs created running flowbysector.py
    :return: df, FBS, with allocated activity columns to sectors
    """
    log.info(
        'Calling on function specified in method yaml to allocate '
        '%s to sectors', ', '.join(map(str, names)))
    # resolve the allocation function named in the yaml, then invoke it
    allocation_fxn = dynamically_import_fxn(k, attr['allocation_source'])
    return allocation_fxn(flow_subset_mapped, attr, fbs_list)
def direct_allocation_method(fbs, k, names, method):
    """
    Directly assign activities to sectors
    :param fbs: df, FBA with flows converted using fedelemflowlist
    :param k: str, source name
    :param names: list, activity names in activity set
    :param method: dictionary, FBS method yaml
    :return: df with sector columns
    """
    log.info('Directly assigning activities to sectors')
    # for each activity, if activities are not sector like,
    # check that there is no data loss
    if check_activities_sector_like(k) is False:
        activity_list = []
        n_allocated = []
        for n in names:
            # avoid double counting by dropping n from the df after calling on
            # n, in the event both ACB and APB values exist
            fbs = fbs[~(
                (fbs[fba_activity_fields[0]].isin(n_allocated)) |
                (fbs[fba_activity_fields[1]].isin(n_allocated)))].reset_index(
                drop=True)
            log.debug('Checking for %s at %s', n,
                      method['target_sector_level'])
            # rows where n appears as either produced-by or consumed-by
            fbs_subset = \
                fbs[(fbs[fba_activity_fields[0]] == n) |
                    (fbs[fba_activity_fields[1]] == n)].reset_index(drop=True)
            # check if an Activity maps to more than one sector,
            # if so, equally allocate
            fbs_subset = equal_allocation(fbs_subset)
            fbs_subset = equally_allocate_parent_to_child_naics(
                fbs_subset, method['target_sector_level'])
            activity_list.append(fbs_subset)
            n_allocated.append(n)
        # recombine the per-activity allocations into one df
        fbs = pd.concat(activity_list, ignore_index=True)
    # sector-like activities pass through unchanged
    return fbs
def getFlowByActivity(datasource, year, flowclass=None, geographic_level=None,
                      download_FBA_if_missing=DEFAULT_DOWNLOAD_IF_MISSING):
    """
    Retrieves stored data in the FlowByActivity format
    :param datasource: str, the code of the datasource.
    :param year: int, a year, e.g. 2012
    :param flowclass: str, a 'Class' of the flow. Optional. E.g. 'Water'
    :param geographic_level: str, a geographic level of the data.
        Optional. E.g. 'national', 'state', 'county'.
    :param download_FBA_if_missing: bool, if True will attempt to load from
        remote server prior to generating if file not found locally
    :return: a pandas DataFrame in FlowByActivity format, or None if the
        FBA could not be loaded, downloaded, or generated
    """
    # Set fba metadata
    name = flowsa.flowbyactivity.set_fba_name(datasource, year)
    fba_meta = set_fb_meta(name, "FlowByActivity")

    # Try to load a local version of FBA
    fba = load_preprocessed_output(fba_meta, paths)
    # If that didn't work, try to download a remote version of FBA
    if fba is None and download_FBA_if_missing:
        log.info('%s %s not found in %s, downloading from remote source',
                 datasource, str(year), fbaoutputpath)
        download_from_remote(fba_meta, paths)
        fba = load_preprocessed_output(fba_meta, paths)
    # If that didn't work or wasn't allowed, try to construct the FBA
    if fba is None:
        log.info('%s %s not found in %s, running functions to generate FBA',
                 datasource, str(year), fbaoutputpath)
        # Generate the fba
        flowsa.flowbyactivity.main(year=year, source=datasource)
        # Now load the fba
        fba = load_preprocessed_output(fba_meta, paths)
    # If none of the above worked, log an error and return None immediately.
    # BUGFIX: previously execution fell through to the optional subsetting
    # below, so a missing FBA combined with a flowclass/geographic_level
    # argument raised a TypeError on the None df instead of returning None.
    if fba is None:
        log.error('getFlowByActivity failed, FBA not found')
        return None
    log.info('Loaded %s %s from %s', datasource, str(year), fbaoutputpath)

    # Address optional parameters
    if flowclass is not None:
        fba = fba[fba['Class'] == flowclass]
    # if geographic level specified, only load rows in geo level
    if geographic_level is not None:
        fba = filter_by_geoscale(fba, geographic_level)
    return fba
def getFlowBySector(methodname,
                    download_FBAs_if_missing=DEFAULT_DOWNLOAD_IF_MISSING,
                    download_FBS_if_missing=DEFAULT_DOWNLOAD_IF_MISSING):
    """
    Loads stored FlowBySector output or generates it if it doesn't exist,
    then loads
    :param methodname: string, Name of an available method for the given class
    :param download_FBAs_if_missing: bool, if True will attempt to load FBAS
        used in generating the FBS from remote server prior to generating if
        file not found locally
    :param download_FBS_if_missing: bool, if True will attempt to load from
        remote server prior to generating if file not found locally
    :return: dataframe in flow by sector format
    """
    meta = set_fb_meta(methodname, "FlowBySector")

    # attempt 1: local cache
    fbs = load_preprocessed_output(meta, paths)

    # attempt 2: remote copy, when downloads are permitted
    if fbs is None and download_FBS_if_missing:
        log.info('%s not found in %s, downloading from remote source',
                 methodname, fbsoutputpath)
        # download and load the FBS parquet (logs land in a Log subdirectory)
        download_from_remote(meta, paths,
                             subdirectory_dict={'.log': 'Log'})
        fbs = load_preprocessed_output(meta, paths)

    # attempt 3: generate from scratch, optionally pulling required FBAs
    # from Data Commons
    if fbs is None:
        log.info('%s not found in %s, running functions to generate FBS',
                 methodname, fbsoutputpath)
        flowsa.flowbysector.main(
            method=methodname,
            download_FBAs_if_missing=download_FBAs_if_missing)
        fbs = load_preprocessed_output(meta, paths)

    # report the outcome
    if fbs is None:
        log.error('getFlowBySector failed, FBS not found')
    else:
        log.info('Loaded %s from %s', methodname, fbsoutputpath)
    return fbs
def check_for_missing_sector_data(df, target_sector_level):
    """
    Modeled after validation.py check_if_losing_sector_data
    Allocates flow amount equally across child NAICS when parent NAICS
    is not target_level
    :param df: df
    :param target_sector_level: str, final sector level of FBS (ex. NAICS_6)
    :return: df with missing sector level data
    """
    from flowsa.dataclean import replace_NoneType_with_empty_cells
    from flowsa.dataclean import replace_strings_with_NoneType

    # temporarily replace null values with empty cells
    df = replace_NoneType_with_empty_cells(df)

    activity_field = "SectorProducedBy"
    # FIX: collect allocated rows in a list and concat once; the previous
    # per-iteration DataFrame.append() is deprecated/removed in pandas 2.0
    # and pd.concat is the idiom used elsewhere in this file
    rows_lost_list = []
    cw_load = load_crosswalk('sector_length')
    for i in range(3, sector_level_key[target_sector_level]):
        # create df of i length
        df_subset = df.loc[df[activity_field].apply(lambda x: len(x) == i)]

        # import cw and subset to current sector length and
        # target sector length
        nlength = list(sector_level_key.keys())[list(
            sector_level_key.values()).index(i)]
        cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
        # add column with counts
        cw['sector_count'] = cw.groupby(nlength)[nlength].transform('count')

        # merge df & replace sector produced columns
        df_x = pd.merge(df_subset, cw, how='left',
                        left_on=[activity_field], right_on=[nlength])
        df_x[activity_field] = df_x[target_sector_level]
        df_x = df_x.drop(columns=[nlength, target_sector_level])

        # calculate new flow amounts, based on sector count,
        # allocating equally to the new sector length codes
        df_x['FlowAmount'] = df_x['FlowAmount'] / df_x['sector_count']
        df_x = df_x.drop(columns=['sector_count'])
        # replace null values with empty cells
        df_x = replace_NoneType_with_empty_cells(df_x)

        # append to list of rows needing allocation
        sector_list = df_subset[activity_field].drop_duplicates()
        if len(df_x) != 0:
            # FIX: one lazy %-style log call instead of the original mix of
            # %s and str.format; the final message text is unchanged
            log.warning('Data found at %s digit NAICS to be allocated: %s',
                        str(i), ' '.join(map(str, sector_list)))
            rows_lost_list.append(df_x)

    if rows_lost_list:
        rows_lost = pd.concat(rows_lost_list, ignore_index=True, sort=True)
        log.info('Allocating FlowAmounts equally to each %s',
                 target_sector_level)
    else:
        rows_lost = pd.DataFrame()
        log.info('No data loss from NAICS in dataframe')

    # add rows of missing data to the fbs sector subset
    df_allocated = pd.concat([df, rows_lost], ignore_index=True, sort=True)
    df_allocated = df_allocated.loc[df_allocated[activity_field].apply(
        lambda x: len(x) == sector_level_key[target_sector_level])]
    # NOTE(review): reset_index without drop=True preserves the old index
    # as an 'index' column, matching original behavior -- confirm intended
    df_allocated.reset_index(inplace=True)

    # replace empty cells with NoneType (if dtype is object)
    df_allocated = replace_strings_with_NoneType(df_allocated)

    return df_allocated
def generate_fbs_bibliography(methodname):
    """
    Generate bibliography for a FlowBySector
    :param methodname: string, methodname to create a bibliography
    :return: a .bib file saved in local directory
    """
    from flowsa.metadata import getMetadata

    # create list of sources in method
    sources = generate_list_of_sources_in_fbs_method(methodname)

    # loop through list of sources, load source method
    # yaml, and create bib entry
    bib_list = []
    source_set = set()
    for source in sources:
        # drop list duplicates and any where year is None (because allocation
        # is a function, not a datasource)
        if source[1] != 'None':
            try:
                config = \
                    load_values_from_literature_citations_config()[source[0]]
            except KeyError:
                try:
                    config = getMetadata(source[0], source[1])
                    # flatten the dictionary so can treat all
                    # dictionaries the same when pulling info
                    config = pd.json_normalize(config, sep='_')
                    config.columns = \
                        config.columns.str.replace('tool_meta_', '')
                    config = config.to_dict(orient='records')[0]
                # BUGFIX: the original `except KeyError or AttributeError:`
                # evaluates the boolean expression first and is equivalent
                # to `except KeyError:` -- AttributeError (e.g. metadata
                # missing expected attributes) was never caught. Catch
                # both via a tuple.
                except (KeyError, AttributeError):
                    log.info('Could not find metadata for %s', source[0])
                    continue
            if config is not None:
                # ensure data sources are not duplicated
                # when different source names
                try:
                    if (config['source_name'], config['author'], source[1],
                            config['source_url']) not in source_set:
                        source_set.add(
                            (config['source_name'], config['author'],
                             source[1], config['source_url']))

                        # if there is a date downloaded, use in
                        # citation over date generated
                        if 'original_data_download_date' in config:
                            bib_date = config['original_data_download_date']
                        elif 'date_accessed' in config:
                            bib_date = config['date_accessed']
                        else:
                            bib_date = config['date_created']

                        db = BibDatabase()
                        db.entries = [{
                            'title': f"{config['source_name']} "
                                     f"{str(source[1])}",
                            'author': config['author'],
                            'year': str(source[1]),
                            'url': config['source_url'],
                            'urldate': bib_date,
                            'ID': config['bib_id'] + '_' + str(source[1]),
                            'ENTRYTYPE': 'misc'
                        }]
                        # append each entry to a list of BibDatabase entries
                        bib_list.append(db)
                except KeyError:
                    log.exception(
                        'Missing information needed to '
                        'create bib for %s, %s', source[0], source[1])
                    continue

    # write out bibliography
    writer = BibTexWriter()
    # create directory if missing
    os.makedirs(outputpath + '/Bibliography', exist_ok=True)
    with open(f'{biboutputpath}{methodname}.bib', 'w') as bibfile:
        # loop through all entries in bib_list
        for b in bib_list:
            bibfile.write(writer.write(b))
def subset_df_by_geoscale(df, activity_from_scale, activity_to_scale):
    """
    Subset a df by geoscale or agg to create data specified in method yaml
    :param df: df, FBA format
    :param activity_from_scale: str, identified geoscale by which to subset or
        aggregate from ('national', 'state', 'county')
    :param activity_to_scale: str, identified geoscale by which to subset or
        aggregate to ('national', 'state', 'county')
    :return: df, FBA, subset or aggregated to a single geoscale for all rows
    """
    # detect grouping cols by columns
    # presence of 'Context' indicates a mapped FBA, which has its own
    # grouping/column definitions
    if 'Context' in df.columns:
        groupbycols = fba_mapped_default_grouping_fields
        cols_to_keep = flow_by_activity_mapped_fields
    else:
        groupbycols = fba_default_grouping_fields
        cols_to_keep = flow_by_activity_fields

    # method of subset dependent on LocationSystem
    if df['LocationSystem'].str.contains('FIPS').any():
        # drop any non-FIPS rows before working at FIPS geoscales
        df = df[df['LocationSystem'].str.contains('FIPS')].reset_index(
            drop=True)
        # determine 'activity_from_scale' for use in df
        # geoscale subset, by activity
        modified_from_scale = \
            return_activity_from_scale(df, activity_from_scale)
        # add 'activity_from_scale' column to df
        df2 = pd.merge(df, modified_from_scale)

        # list of unique 'from' geoscales
        unique_geoscales = modified_from_scale[
            'activity_from_scale'].drop_duplicates().values.tolist()
        if len(unique_geoscales) > 1:
            log.info('Dataframe has a mix of geographic levels: %s',
                     ', '.join(unique_geoscales))

        # to scale
        # never aggregate beyond the requested 'from' scale: pick the
        # smaller (more aggregated) of the two scales as the target
        if fips_number_key[activity_from_scale] > \
                fips_number_key[activity_to_scale]:
            to_scale = activity_to_scale
        else:
            to_scale = activity_from_scale

        df_subset_list = []
        # subset df based on activity 'from' scale
        for i in unique_geoscales:
            df3 = df2[df2['activity_from_scale'] == i]
            # if desired geoscale doesn't exist, aggregate existing data
            # if df is less aggregated than allocation df, aggregate
            # fba activity to allocation geoscale
            if fips_number_key[i] > fips_number_key[to_scale]:
                log.info("Aggregating subset from %s to %s", i, to_scale)
                df_sub = agg_by_geoscale(df3, i, to_scale, groupbycols)
            # else filter relevant rows
            else:
                log.info("Subsetting %s data", i)
                df_sub = filter_by_geoscale(df3, i)
            df_subset_list.append(df_sub)
        df_subset = pd.concat(df_subset_list, ignore_index=True)

        # drop unused columns
        df_subset = clean_df(df_subset, cols_to_keep, fba_fill_na_dict,
                             drop_description=False)

        return df_subset
    # right now, the only other location system is for Statistics Canada data
    else:
        return df
def dataset_allocation_method(flow_subset_mapped, attr, names, method, k, v,
                              aset, aset_names, download_FBA_if_missing):
    """
    Method of allocation using a specified data source
    :param flow_subset_mapped: FBA subset mapped using federal
        elementary flow list
    :param attr: dictionary, attribute data from method yaml for activity set
    :param names: list, activity names in activity set
    :param method: dictionary, FBS method yaml
    :param k: str, the datasource name
    :param v: dictionary, the datasource parameters
    :param aset: dictionary items for FBS method yaml
    :param aset_names: list, activity set names
    :param download_FBA_if_missing: bool, indicate if missing FBAs
        should be downloaded from Data Commons
    :return: df, allocated activity names
    """
    from flowsa.validation import compare_df_units

    # add parameters to dictionary if exist in method yaml
    fba_dict = {}
    if 'allocation_flow' in attr:
        fba_dict['flowname_subset'] = attr['allocation_flow']
    if 'allocation_compartment' in attr:
        fba_dict['compartment_subset'] = attr['allocation_compartment']
    if 'clean_allocation_fba' in attr:
        fba_dict['clean_fba'] = attr['clean_allocation_fba']
    if 'clean_allocation_fba_w_sec' in attr:
        fba_dict['clean_fba_w_sec'] = attr['clean_allocation_fba_w_sec']

    # load the allocation FBA
    fba_allocation_wsec = \
        load_map_clean_fba(method, attr,
                           fba_sourcename=attr['allocation_source'],
                           df_year=attr['allocation_source_year'],
                           flowclass=attr['allocation_source_class'],
                           geoscale_from=attr['allocation_from_scale'],
                           geoscale_to=v['geoscale_to_use'],
                           download_FBA_if_missing=download_FBA_if_missing,
                           **fba_dict)

    # subset fba datasets to only keep the sectors associated
    # with activity subset
    log.info("Subsetting %s for sectors in %s", attr['allocation_source'], k)
    fba_allocation_subset = \
        get_fba_allocation_subset(fba_allocation_wsec, k, names,
                                  flowSubsetMapped=flow_subset_mapped,
                                  allocMethod=attr['allocation_method'])

    # if there is an allocation helper dataset, modify allocation df
    if 'helper_source' in attr:
        log.info("Using the specified allocation help for subset of %s",
                 attr['allocation_source'])
        fba_allocation_subset = \
            allocation_helper(fba_allocation_subset, attr, method, v,
                              download_FBA_if_missing=download_FBA_if_missing)

    # create flow allocation ratios for each activity
    flow_alloc_list = []
    # grouping fields depend on whether the allocation df is mapped
    # (has a 'Context' column)
    if 'Context' in fba_allocation_subset.columns:
        group_cols = fba_mapped_wsec_default_grouping_fields
    else:
        group_cols = fba_wsec_default_grouping_fields
    group_cols = [
        e for e in group_cols
        if e not in ('ActivityProducedBy', 'ActivityConsumedBy')
    ]
    n_allocated = []
    for n in names:
        log.debug("Creating allocation ratios for %s", n)
        # if n has already been called, drop all rows of data
        # containing n to avoid double counting when there are two
        # activities in each ACB and APB columns
        fba_allocation_subset = fba_allocation_subset[~(
            (fba_allocation_subset[fba_activity_fields[0]].isin(n_allocated))
            |
            (fba_allocation_subset[fba_activity_fields[1]].isin(n_allocated))
        )].reset_index(drop=True)
        fba_allocation_subset_2 = \
            get_fba_allocation_subset(fba_allocation_subset, k, [n],
                                      flowSubsetMapped=flow_subset_mapped,
                                      allocMethod=attr['allocation_method'],
                                      activity_set_names=aset_names)
        if len(fba_allocation_subset_2) == 0:
            log.info("No data found to allocate %s", n)
        else:
            flow_alloc = \
                allocate_by_sector(fba_allocation_subset_2, attr,
                                   attr['allocation_method'], group_cols,
                                   flowSubsetMapped=flow_subset_mapped)
            # tag each ratio row with the activity it was built for
            flow_alloc = flow_alloc.assign(FBA_Activity=n)
            n_allocated.append(n)
            flow_alloc_list.append(flow_alloc)
    flow_allocation = pd.concat(flow_alloc_list, ignore_index=True)

    # generalize activity field names to enable link to main fba source
    log.info("Generalizing activity columns in subset of %s",
             attr['allocation_source'])
    flow_allocation = collapse_activity_fields(flow_allocation)

    # check for issues with allocation ratios
    check_allocation_ratios(flow_allocation, aset, method, attr)

    # create list of sectors in the flow allocation df,
    # drop any rows of data in the flow df that aren't in list
    sector_list = flow_allocation['Sector'].unique().tolist()

    # subset fba allocation table to the values in the activity
    # list, based on overlapping sectors
    flow_subset_mapped = flow_subset_mapped.loc[
        (flow_subset_mapped[fbs_activity_fields[0]].isin(sector_list)) |
        (flow_subset_mapped[fbs_activity_fields[1]].isin(sector_list))]

    # check if fba and allocation dfs have the same LocationSystem
    log.info("Checking if flowbyactivity and allocation "
             "dataframes use the same location systems")
    check_if_location_systems_match(flow_subset_mapped, flow_allocation)

    # merge fba df w/flow allocation dataset
    # merge once per activity field (produced-by and consumed-by), so
    # FlowAmountRatio_x/_y columns exist after the loop
    log.info("Merge %s and subset of %s", k, attr['allocation_source'])
    for i, j in activity_fields.items():
        # check units
        compare_df_units(flow_subset_mapped, flow_allocation)
        # create list of columns to merge on
        if 'allocation_merge_columns' in attr:
            fa_cols = \
                ['Location', 'Sector', 'FlowAmountRatio', 'FBA_Activity'] + \
                attr['allocation_merge_columns']
            l_cols = \
                ['Location', j[1]["flowbysector"],
                 j[0]["flowbyactivity"]] + \
                attr['allocation_merge_columns']
            r_cols = ['Location', 'Sector', 'FBA_Activity'] + \
                attr['allocation_merge_columns']
        else:
            fa_cols = ['Location', 'Sector', 'FlowAmountRatio',
                       'FBA_Activity']
            l_cols = ['Location', j[1]["flowbysector"],
                      j[0]["flowbyactivity"]]
            r_cols = ['Location', 'Sector', 'FBA_Activity']
        flow_subset_mapped = \
            flow_subset_mapped.merge(flow_allocation[fa_cols],
                                     left_on=l_cols, right_on=r_cols,
                                     how='left')

    # merge the flowamount columns
    flow_subset_mapped.loc[:, 'FlowAmountRatio'] = \
        flow_subset_mapped['FlowAmountRatio_x'].fillna(
            flow_subset_mapped['FlowAmountRatio_y'])
    # fill null rows with 0 because no allocation info
    flow_subset_mapped['FlowAmountRatio'] = \
        flow_subset_mapped['FlowAmountRatio'].fillna(0)

    # drop rows where there is no allocation data
    fbs = flow_subset_mapped.dropna(subset=['Sector_x', 'Sector_y'],
                                    how='all').reset_index()

    # calculate flow amounts for each sector
    log.info("Calculating new flow amounts using flow ratios")
    fbs.loc[:, 'FlowAmount'] = fbs['FlowAmount'] * fbs['FlowAmountRatio']

    # drop columns
    log.info("Cleaning up new flow by sector")
    fbs = fbs.drop(columns=[
        'Sector_x', 'FlowAmountRatio_x', 'Sector_y', 'FlowAmountRatio_y',
        'FlowAmountRatio', 'FBA_Activity_x', 'FBA_Activity_y'
    ])
    return fbs
def reset_fbs_dq_scores(df):
    """
    Set all Data Quality Scores to None
    :param df: FBS dataframe with mixed values/strings in columns
    :return: FBS df with the DQ scores set to null
    """
    # all current data-quality values are misleading, so null every DQ
    # column. Loop instead of eight copy-pasted assign/log pairs (whose
    # inline comments had drifted out of sync); the emitted log messages
    # are identical to the originals.
    dq_cols = [
        'Spread', 'Min', 'Max', 'DataReliability', 'TemporalCorrelation',
        'GeographicalCorrelation', 'TechnologicalCorrelation',
        'DataCollection'
    ]
    for col in dq_cols:
        log.info('Reset %s to None', col)
        df = df.assign(**{col: None})
    return df
def load_map_clean_fba(method, attr, fba_sourcename, df_year, flowclass,
                       geoscale_from, geoscale_to, **kwargs):
    """
    Load, clean, and map a FlowByActivity df
    :param method: dictionary, FBS method yaml
    :param attr: dictionary, attribute data from method yaml for activity set
    :param fba_sourcename: str, source name
    :param df_year: str, year
    :param flowclass: str, flowclass to subset df with
    :param geoscale_from: str, geoscale to use
    :param geoscale_to: str, geoscale to aggregate to
    :param kwargs: dictionary, can include parameters: 'allocation_flow',
        'allocation_compartment','clean_allocation_fba',
        'clean_allocation_fba_w_sec', 'download_FBA_if_missing'
    :return: df, fba format
    """
    # FIX: 'download_FBA_if_missing' is an optional kwarg (guarded with
    # an 'in' check below), but the clean-fba branches previously read
    # kwargs['download_FBA_if_missing'] unconditionally, raising KeyError
    # when omitted. Resolve it once with the module default.
    download_fba_if_missing = kwargs.get('download_FBA_if_missing',
                                         DEFAULT_DOWNLOAD_IF_MISSING)

    # dictionary to load/standardize fba
    kwargs_dict = {}
    if 'download_FBA_if_missing' in kwargs:
        kwargs_dict['download_FBA_if_missing'] = \
            kwargs['download_FBA_if_missing']
    if 'allocation_map_to_flow_list' in attr:
        kwargs_dict['allocation_map_to_flow_list'] = \
            attr['allocation_map_to_flow_list']

    log.info("Loading allocation flowbyactivity %s for year %s",
             fba_sourcename, str(df_year))
    fba = load_fba_w_standardized_units(datasource=fba_sourcename,
                                        year=df_year,
                                        flowclass=flowclass,
                                        **kwargs_dict)

    # check if allocation data exists at specified geoscale to use
    log.info("Checking if allocation data exists at the %s level",
             geoscale_from)
    check_if_data_exists_at_geoscale(fba, geoscale_from)

    # aggregate geographically to the scale of the flowbyactivty source,
    # if necessary
    fba = subset_df_by_geoscale(fba, geoscale_from, geoscale_to)

    # subset based on yaml settings
    # the literal string 'None' in the yaml means "no subset"
    if 'flowname_subset' in kwargs:
        if kwargs['flowname_subset'] != 'None':
            fba = fba.loc[fba['FlowName'].isin(kwargs['flowname_subset'])]
    if 'compartment_subset' in kwargs:
        if kwargs['compartment_subset'] != 'None':
            fba = \
                fba.loc[fba['Compartment'].isin(kwargs['compartment_subset'])]

    # cleanup the fba allocation df, if necessary
    if 'clean_fba' in kwargs:
        log.info("Cleaning %s", fba_sourcename)
        fba = dynamically_import_fxn(fba_sourcename, kwargs["clean_fba"])(
            fba, attr=attr,
            download_FBA_if_missing=download_fba_if_missing)
    # reset index
    fba = fba.reset_index(drop=True)

    # assign sector to allocation dataset
    log.info("Adding sectors to %s", fba_sourcename)
    fba_wsec = add_sectors_to_flowbyactivity(
        fba, sectorsourcename=method['target_sector_source'])

    # call on fxn to further clean up/disaggregate the fba
    # allocation data, if exists
    if 'clean_fba_w_sec' in kwargs:
        log.info("Further disaggregating sectors in %s", fba_sourcename)
        fba_wsec = dynamically_import_fxn(
            fba_sourcename, kwargs['clean_fba_w_sec'])(
                fba_wsec, attr=attr, method=method,
                sourcename=fba_sourcename,
                download_FBA_if_missing=download_fba_if_missing)

    return fba_wsec
def stewicombo_to_sector(yaml_load):
    """
    Returns emissions from stewicombo in fbs format, requires stewi >= 0.9.5
    :param yaml_load: which may contain the following elements:
        local_inventory_name: (optional) a string naming the file from which
                to source a pregenerated stewicombo file stored locally (e.g.,
                'CAP_HAP_national_2017_v0.9.7_5cf36c0.parquet' or
                'CAP_HAP_national_2017')
        inventory_dict: a dictionary of inventory types and years (e.g.,
                {'NEI':'2017', 'TRI':'2017'})
        NAICS_level: desired NAICS aggregation level, using sector_level_key,
                should match target_sector_level
        geo_scale: desired geographic aggregation level ('national', 'state',
                'county'), should match target_geoscale
        compartments: list of compartments to include (e.g., 'water', 'air',
                'soil'), use None to include all compartments
        functions: list of functions (str) to call for additional processing
    :return: df, FBS format
    """
    import stewicombo
    from flowsa.data_source_scripts.EPA_NEI import drop_GHGs

    # determine if fxns specified in FBS method yaml
    # NOTE(review): 'functions' aliases the list inside yaml_load, and the
    # .remove() calls below mutate it in place - confirm callers do not
    # reuse yaml_load afterwards
    if 'functions' not in yaml_load:
        functions = []
    else:
        functions = yaml_load['functions']

    if 'local_inventory_name' in yaml_load:
        inventory_name = yaml_load['local_inventory_name']
    else:
        inventory_name = None

    # translate e.g. 'NAICS_6' to the numeric sector string length
    NAICS_level_value = sector_level_key[yaml_load['NAICS_level']]

    df = None
    # first try to load a pregenerated local stewicombo inventory
    if inventory_name is not None:
        df = stewicombo.getInventory(inventory_name, True)
    if df is None:
        # run stewicombo to combine inventories, filter for LCI, remove overlap
        log.info('generating inventory in stewicombo')
        df = stewicombo.combineFullInventories(
            yaml_load['inventory_dict'], filter_for_LCI=True,
            remove_overlap=True, compartments=yaml_load['compartments'])

    if df is None:
        # Inventories not found for stewicombo, return empty FBS
        return None

    # drop stewicombo-internal identifier columns not needed in FBS
    df.drop(columns=['SRS_CAS', 'SRS_ID', 'FacilityIDs_Combined'],
            inplace=True)

    inventory_list = list(yaml_load['inventory_dict'].keys())

    # drop GHGs before facility merge if requested, then remove the fxn
    # name so the generic dispatch loop below does not call it again
    if 'drop_GHGs' in functions:
        df = drop_GHGs(df)
        functions.remove('drop_GHGs')
    facility_mapping = extract_facility_data(yaml_load['inventory_dict'])
    # use NAICS from facility matcher so drop them here
    facility_mapping.drop(columns=['NAICS'], inplace=True)
    # merge dataframes to assign facility information based on facility IDs
    df = pd.merge(df, facility_mapping, how='left', on='FacilityID')

    all_NAICS = obtain_NAICS_from_facility_matcher(inventory_list)
    df = pd.merge(df, all_NAICS, how='left', on=['FRS_ID', 'Source'])

    # add levelized NAICS code prior to aggregation
    df['NAICS_lvl'] = df['NAICS'].str[0:NAICS_level_value]

    if 'reassign_airplane_emissions' in functions:
        df = reassign_airplane_emissions(df,
                                         yaml_load['inventory_dict']['NEI'],
                                         NAICS_level_value)
        functions.remove('reassign_airplane_emissions')

    # record provenance of each row before aggregation to FBS
    df['MetaSources'] = df['Source']

    fbs = prepare_stewi_fbs(df, yaml_load['inventory_dict'],
                            yaml_load['NAICS_level'], yaml_load['geo_scale'])

    # dispatch any remaining processing fxns named in the method yaml;
    # each must be defined in this module and take/return an FBS df
    for function in functions:
        fbs = getattr(sys.modules[__name__], function)(fbs)

    return fbs
def compare_activity_to_sector_flowamounts(fba_load, fbs_load, activity_set,
                                           source_name, config):
    """
    Function to compare the loaded flowbyactivity with the final flowbysector
    by activityname (if exists) to target sector level
    output, checking for data loss
    :param fba_load: df, FBA loaded and mapped using FEDEFL
    :param fbs_load: df, final FBS df
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param config: dictionary, method yaml
    :return: printout data differences between loaded FBA and FBS output,
        save results as csv in local directory
    """
    if check_activities_sector_like(source_name):
        vLog.debug('Not comparing loaded FlowByActivity to FlowBySector '
                   'ratios for a dataset with sector-like activities because '
                   'if there are modifications to flowamounts for a sector, '
                   'then the ratios will be different')
    else:
        # subset fba df to the columns relevant for comparison
        fba = fba_load[[
            'Class', 'MetaSources', 'Flowable', 'Unit', 'FlowType',
            'ActivityProducedBy', 'ActivityConsumedBy', 'Context', 'Location',
            'LocationSystem', 'Year', 'FlowAmount'
        ]].drop_duplicates().reset_index(drop=True)
        # collapse to national totals so FBA/FBS are compared at one location
        fba.loc[:, 'Location'] = US_FIPS
        group_cols = [
            'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
            'FlowType', 'Context', 'Location', 'LocationSystem', 'Year'
        ]
        fba_agg = aggregator(fba, group_cols)
        fba_agg.rename(columns={'FlowAmount': 'FBA_amount'}, inplace=True)

        # subset fbs df
        fbs = fbs_load[[
            'Class', 'SectorSourceName', 'Flowable', 'Unit', 'FlowType',
            'SectorProducedBy', 'SectorConsumedBy', 'ActivityProducedBy',
            'ActivityConsumedBy', 'Context', 'Location', 'LocationSystem',
            'Year', 'FlowAmount'
        ]].drop_duplicates().reset_index(drop=True)

        fbs = replace_NoneType_with_empty_cells(fbs)

        # SectorLength = length of the longer of the two sector codes,
        # used below to subset to the target sector level
        fbs['ProducedLength'] = fbs['SectorProducedBy'].str.len()
        fbs['ConsumedLength'] = fbs['SectorConsumedBy'].str.len()
        fbs['SectorLength'] = fbs[['ProducedLength',
                                   'ConsumedLength']].max(axis=1)
        fbs.loc[:, 'Location'] = US_FIPS
        group_cols = [
            'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
            'FlowType', 'Context', 'Location', 'LocationSystem', 'Year',
            'SectorLength'
        ]
        fbs_agg = aggregator(fbs, group_cols)
        fbs_agg.rename(columns={'FlowAmount': 'FBS_amount'}, inplace=True)

        # merge compare 1 and compare 2
        df_merge = fba_agg.merge(fbs_agg,
                                 left_on=[
                                     'ActivityProducedBy',
                                     'ActivityConsumedBy', 'Flowable', 'Unit',
                                     'FlowType', 'Context', 'Location',
                                     'LocationSystem', 'Year'
                                 ],
                                 right_on=[
                                     'ActivityProducedBy',
                                     'ActivityConsumedBy', 'Flowable', 'Unit',
                                     'FlowType', 'Context', 'Location',
                                     'LocationSystem', 'Year'
                                 ],
                                 how='left')
        # ratio of 1 means no data was gained/lost between FBA and FBS
        df_merge['Ratio'] = df_merge['FBS_amount'] / df_merge['FBA_amount']

        # reorder
        df_merge = df_merge[[
            'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
            'FlowType', 'Context', 'Location', 'LocationSystem', 'Year',
            'SectorLength', 'FBA_amount', 'FBS_amount', 'Ratio'
        ]]

        # keep only rows of specified sector length
        comparison = df_merge[
            df_merge['SectorLength'] == sector_level_key[
                config['target_sector_level']]].reset_index(drop=True)

        tolerance = 0.01
        comparison2 = comparison[(comparison['Ratio'] < 1 - tolerance) |
                                 (comparison['Ratio'] > 1 + tolerance)]

        if len(comparison2) > 0:
            vLog.info(
                'There are %s combinations of flowable/context/sector '
                'length where the flowbyactivity to flowbysector ratio '
                'is less than or greater than 1 by %s', len(comparison2),
                str(tolerance))

        # include df subset in the validation log
        # only print rows where flowamount ratio is less than 1
        # (rounded flowamountratio)
        df_v = comparison2[comparison2['Ratio'].apply(
            lambda x: round(x, 3) < 1)].reset_index(drop=True)

        # save to validation log
        log.info(
            'Save the comparison of FlowByActivity load '
            'to FlowBySector ratios for %s in validation log', activity_set)
        # if df not empty, print, if empty, print string
        if df_v.empty:
            vLogDetailed.info('Ratios for %s all round to 1', activity_set)
        else:
            vLogDetailed.info(
                'Comparison of FlowByActivity load to '
                'FlowBySector ratios for %s: '
                '\n {}'.format(df_v.to_string()), activity_set)
def check_allocation_ratios(flow_alloc_df_load, activity_set, config, attr):
    """
    Check for issues with the flow allocation ratios
    :param flow_alloc_df_load: df, includes 'FlowAmountRatio' column
    :param activity_set: str, activity set
    :param config: dictionary, method yaml
    :param attr: dictionary, activity set info
    :return: print out information regarding allocation ratios,
        save csv of results to local directory
    """
    # grouping columns: always activity/location/sector length, plus any
    # merge columns the activity set declares, so ratios are checked at the
    # same granularity they were created at
    groupcols = ['FBA_Activity', 'Location', 'SectorLength']
    if 'allocation_merge_columns' in attr:
        groupcols = groupcols + attr['allocation_merge_columns']
    subset_cols = groupcols + ['FlowAmountRatio']

    # attach sector code lengths for filtering to the target sector level
    ratios = flow_alloc_df_load.assign(
        SectorLength=flow_alloc_df_load['Sector'].str.len())

    # sum the flow amount ratios within each group
    ratio_totals = ratios[subset_cols].groupby(
        groupcols, dropna=False, as_index=False).agg({"FlowAmountRatio": sum})

    # keep only rows at the method's target sector length
    target_length = sector_level_key[config['target_sector_level']]
    ratio_totals = ratio_totals[
        ratio_totals['SectorLength'] == target_length].reset_index(drop=True)

    # flag groups whose summed ratio deviates from 1 beyond the tolerance
    tolerance = 0.01
    flagged = ratio_totals[
        (ratio_totals['FlowAmountRatio'] < 1 - tolerance) |
        (ratio_totals['FlowAmountRatio'] > 1 + tolerance)]

    if len(flagged) > 0:
        vLog.info(
            'There are %s instances at a sector length of %s '
            'where the allocation ratio for a location is greater '
            'than or less than 1 by at least %s. See Validation Log',
            len(flagged), config["target_sector_level"], str(tolerance))

    # add to validation log
    log.info(
        'Save the summary table of flow allocation ratios for each '
        'sector length for %s in validation log', activity_set)
    # if df not empty, print, if empty, print string
    if flagged.empty:
        vLogDetailed.info('Flow allocation ratios for %s '
                          'all round to 1', activity_set)
    else:
        vLogDetailed.info(
            'Flow allocation ratios for %s: '
            '\n {}'.format(flagged.to_string()), activity_set)
def main(**kwargs):
    """
    Creates a flowbysector dataset
    :param kwargs: dictionary of arguments, only argument is
        "method_name", the name of method corresponding to flowbysector
        method yaml name
    :return: parquet, FBS save to local folder
    """
    # fall back to command-line arguments when called without kwargs
    if len(kwargs) == 0:
        kwargs = parse_args()

    method_name = kwargs['method']
    download_FBA_if_missing = kwargs.get('download_FBAs_if_missing')
    # assign arguments
    vLog.info("Initiating flowbysector creation for %s", method_name)
    # call on method
    method = load_yaml_dict(method_name, flowbytype='FBS')
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # Create empty list for storing fbs files
    fbs_list = []
    for k, v in fb.items():
        # pull fba data for allocation
        flows = load_source_dataframe(k, v, download_FBA_if_missing)

        if v['data_format'] == 'FBA':
            # ensure correct datatypes and that all fields exist
            flows = clean_df(flows, flow_by_activity_fields,
                             fba_fill_na_dict, drop_description=False)

            # clean up fba before mapping, if specified in yaml
            if "clean_fba_before_mapping_df_fxn" in v:
                vLog.info("Cleaning up %s FlowByActivity", k)
                flows = dynamically_import_fxn(
                    k, v["clean_fba_before_mapping_df_fxn"])(flows)

            # map flows to federal flow list or material flow list
            flows_mapped, mapping_files = \
                map_fbs_flows(flows, k, v, keep_fba_columns=True)

            # clean up fba, if specified in yaml
            if "clean_fba_df_fxn" in v:
                vLog.info("Cleaning up %s FlowByActivity", k)
                flows_mapped = dynamically_import_fxn(
                    k, v["clean_fba_df_fxn"])(flows_mapped)

            # if activity_sets are specified in a file, call them here
            if 'activity_set_file' in v:
                aset_names = pd.read_csv(flowbysectoractivitysetspath +
                                         v['activity_set_file'], dtype=str)
            else:
                aset_names = None

            # master list of activity names read in from data source
            ml_act = []
            # create dictionary of allocation datasets for different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities
                if 'activity_set_file' in v:
                    names = \
                        aset_names[aset_names['activity_set'] == aset]['name']
                else:
                    names = attr['names']

                # to avoid double counting data from the same source, in
                # the event there are values in both the APB and ACB
                # columns, if an activity has already been read in and
                # allocated, remove that activity from the mapped flows
                # regardless of what activity set the data was read in
                flows_mapped = flows_mapped[~(
                    (flows_mapped[fba_activity_fields[0]].isin(ml_act)) |
                    (flows_mapped[fba_activity_fields[1]].isin(ml_act))
                )].reset_index(drop=True)
                ml_act.extend(names)

                vLog.info("Preparing to handle %s in %s", aset, k)
                # subset fba data by activity
                flows_subset = flows_mapped[
                    (flows_mapped[fba_activity_fields[0]].isin(names)) |
                    (flows_mapped[fba_activity_fields[1]].isin(names)
                     )].reset_index(drop=True)

                # subset by flowname if exists
                if 'source_flows' in attr:
                    flows_subset = flows_subset[flows_subset['FlowName'].isin(
                        attr['source_flows'])]
                # skip activity sets with no usable data
                if len(flows_subset) == 0:
                    log.warning(f"no data found for flows in {aset}")
                    continue
                if len(flows_subset[flows_subset['FlowAmount'] != 0]) == 0:
                    log.warning(f"all flow data for {aset} is 0")
                    continue
                # if activities are sector-like, check sectors are valid
                if check_activities_sector_like(k):
                    flows_subset2 = replace_naics_w_naics_from_another_year(
                        flows_subset, method['target_sector_source'])

                    # check impact on df FlowAmounts
                    vLog.info(
                        'Calculate FlowAmount difference caused by '
                        'replacing NAICS Codes with %s, saving '
                        'difference in Validation log',
                        method['target_sector_source'],
                    )
                    calculate_flowamount_diff_between_dfs(
                        flows_subset, flows_subset2)
                else:
                    flows_subset2 = flows_subset.copy()

                # extract relevant geoscale data or aggregate existing data
                flows_subset_geo = subset_df_by_geoscale(
                    flows_subset2, v['geoscale_to_use'],
                    attr['allocation_from_scale'])
                # if loading data subnational geoscale, check for data loss
                if attr['allocation_from_scale'] != 'national':
                    compare_geographic_totals(flows_subset_geo, flows_mapped,
                                              k, attr, aset, names)

                # Add sectors to df activity, depending on level
                # of specified sector aggregation
                log.info("Adding sectors to %s", k)
                flows_subset_wsec = add_sectors_to_flowbyactivity(
                    flows_subset_geo,
                    sectorsourcename=method['target_sector_source'],
                    allocationmethod=attr['allocation_method'])
                # clean up fba with sectors, if specified in yaml
                if "clean_fba_w_sec_df_fxn" in v:
                    vLog.info("Cleaning up %s FlowByActivity with sectors", k)
                    flows_subset_wsec = dynamically_import_fxn(
                        k, v["clean_fba_w_sec_df_fxn"])(flows_subset_wsec,
                                                        attr=attr,
                                                        method=method)

                # rename SourceName to MetaSources and drop columns
                flows_mapped_wsec = flows_subset_wsec.\
                    rename(columns={'SourceName': 'MetaSources'}).\
                    drop(columns=['FlowName', 'Compartment'])

                # if allocation method is "direct", then no need
                # to create alloc ratios, else need to use allocation
                # dataframe to create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    fbs = direct_allocation_method(flows_mapped_wsec,
                                                   k, names, method)
                # if allocation method for an activity set requires a specific
                # function due to the complicated nature
                # of the allocation, call on function here
                elif attr['allocation_method'] == 'allocation_function':
                    fbs = function_allocation_method(
                        flows_mapped_wsec, k, names, attr, fbs_list)
                else:
                    fbs = dataset_allocation_method(
                        flows_mapped_wsec, attr, names, method, k, v, aset,
                        aset_names, download_FBA_if_missing)

                # drop rows where flowamount = 0
                # (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # define grouping columns dependent on sectors
                # being activity-like or not
                if check_activities_sector_like(k) is False:
                    groupingcols = fbs_grouping_fields_w_activities
                    groupingdict = flow_by_sector_fields_w_activity
                else:
                    groupingcols = fbs_default_grouping_fields
                    groupingdict = flow_by_sector_fields

                # clean df
                fbs = clean_df(fbs, groupingdict, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                log.info("Aggregating flowbysector to %s level",
                         method['target_geoscale'])
                # determine from scale: the smaller (more granular) of the
                # source geoscale and the activity set's allocation scale
                if fips_number_key[v['geoscale_to_use']] < \
                        fips_number_key[attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']

                fbs_geo_agg = agg_by_geoscale(
                    fbs, from_scale, method['target_geoscale'], groupingcols)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs_sec_agg = sector_aggregation(fbs_geo_agg, groupingcols)
                # add missing naics5/6 when only one naics5/6
                # associated with a naics4
                fbs_agg = sector_disaggregation(fbs_sec_agg)

                # check if any sector information is lost before reaching
                # the target sector length, if so,
                # allocate values equally to disaggregated sectors
                vLog.info(
                    'Searching for and allocating FlowAmounts for any parent '
                    'NAICS that were dropped in the subset to '
                    '%s child NAICS', method['target_sector_level'])
                fbs_agg_2 = equally_allocate_parent_to_child_naics(
                    fbs_agg, method['target_sector_level'])

                # compare flowbysector with flowbyactivity
                compare_activity_to_sector_flowamounts(
                    flows_mapped_wsec, fbs_agg_2, aset, k, method)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])

                # subset df, necessary because not all of the sectors are
                # NAICS and can get duplicate rows
                # fbs_1: both sector columns at target level;
                # fbs_2/fbs_3: one sector column at target level, other null
                fbs_1 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))].\
                    reset_index(drop=True)
                fbs_2 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isnull())].\
                    reset_index(drop=True)
                fbs_3 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isnull()) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))].\
                    reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2, fbs_3])

                # drop activity columns
                fbs_sector_subset = fbs_sector_subset.drop(
                    ['ActivityProducedBy', 'ActivityConsumedBy'],
                    axis=1, errors='ignore')

                # save comparison of FBA total to FBS total for an activity set
                compare_fba_geo_subset_and_fbs_output_totals(
                    flows_subset_geo, fbs_sector_subset, aset, k, v,
                    attr, method)

                log.info("Completed flowbysector for %s", aset)
                fbs_list.append(fbs_sector_subset)
        else:
            if 'clean_fbs_df_fxn' in v:
                flows = dynamically_import_fxn(v["clean_fbs_df_fxn_source"],
                                               v["clean_fbs_df_fxn"])(flows)
            flows = update_geoscale(flows, method['target_geoscale'])
            # if the loaded flow dt is already in FBS format,
            # append directly to list of FBS
            log.info("Append %s to FBS list", k)
            # ensure correct field datatypes and add any missing fields
            flows = clean_df(flows, flow_by_sector_fields,
                             fbs_fill_na_dict)
            fbs_list.append(flows)
    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbs_list, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # add missing fields, ensure correct data type,
    # add missing columns, reorder columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    # prior to aggregating, replace MetaSources string with all sources
    # that share context/flowable/sector values
    fbss = harmonize_FBS_columns(fbss)
    # aggregate df as activities might have data for
    # the same specified sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    # ensure correct data types/order of columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable',
         'Context']).reset_index(drop=True)
    # check for negative flow amounts
    check_for_negative_flowamounts(fbss)
    # tmp reset data quality scores
    fbss = reset_fbs_dq_scores(fbss)
    # save parquet file
    meta = set_fb_meta(method_name, "FlowBySector")
    write_df_to_file(fbss, paths, meta)
    write_metadata(method_name, method, meta, "FlowBySector")
    # rename the log file saved to local directory
    rename_log_file(method_name, meta)
    log.info(
        'See the Validation log for detailed assessment of '
        'model results in %s', logoutputpath)
def compare_fba_geo_subset_and_fbs_output_totals(fba_load, fbs_load,
                                                 activity_set, source_name,
                                                 source_attr, activity_attr,
                                                 method):
    """
    Function to compare the loaded flowbyactivity total after
    subsetting by activity and geography with the final flowbysector output
    total. Not a direct comparison of the loaded FBA because FBAs are
    modified before being subset by activity for the target sector level
    :param fba_load: df, FBA loaded, before being mapped
    :param fbs_load: df, final FBS df at target sector level
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param source_attr: dictionary, attribute data from method yaml
        for source data
    :param activity_attr: dictionary, attribute data from method yaml
        for activity set
    :param method: dictionary, FBS method yaml
    :return: printout data differences between loaded FBA and FBS output
        totals by location, save results as csv in local directory
    """
    vLog.info('Comparing Flow-By-Activity subset by activity and geography to '
              'the subset Flow-By-Sector FlowAmount total.')

    # determine from scale: the smaller (more granular) of the source
    # geoscale and the activity set's allocation scale
    if fips_number_key[source_attr['geoscale_to_use']] < \
            fips_number_key[activity_attr['allocation_from_scale']]:
        from_scale = source_attr['geoscale_to_use']
    else:
        from_scale = activity_attr['allocation_from_scale']

    # extract relevant geoscale data or aggregate existing data
    fba = subset_df_by_geoscale(fba_load, from_scale,
                                method['target_geoscale'])
    if check_activities_sector_like(source_name):
        # if activities are sector-like, run sector aggregation and then
        # subset df to only keep NAICS2
        fba = fba[[
            'Class', 'FlowAmount', 'Unit', 'Context', 'ActivityProducedBy',
            'ActivityConsumedBy', 'Location', 'LocationSystem'
        ]]
        # rename the activity cols to sector cols for purposes of aggregation
        fba = fba.rename(
            columns={
                'ActivityProducedBy': 'SectorProducedBy',
                'ActivityConsumedBy': 'SectorConsumedBy'
            })
        group_cols_agg = [
            'Class', 'Context', 'Unit', 'Location', 'LocationSystem',
            'SectorProducedBy', 'SectorConsumedBy'
        ]
        fba = sector_aggregation(fba, group_cols_agg)
        # subset fba to only include NAICS2
        fba = replace_NoneType_with_empty_cells(fba)
        fba = fba[fba['SectorConsumedBy'].apply(lambda x: len(x) == 2) |
                  fba['SectorProducedBy'].apply(lambda x: len(x) == 2)]
    # subset/agg dfs
    col_subset = [
        'Class', 'FlowAmount', 'Unit', 'Context', 'Location',
        'LocationSystem'
    ]
    group_cols = ['Class', 'Unit', 'Context', 'Location', 'LocationSystem']
    # check units
    compare_df_units(fba, fbs_load)
    # fba
    fba = fba[col_subset]
    fba_agg = aggregator(fba, group_cols).reset_index(drop=True)
    fba_agg.rename(columns={
        'FlowAmount': 'FBA_amount',
        'Unit': 'FBA_unit'
    }, inplace=True)

    # fbs
    fbs = fbs_load[col_subset]
    fbs_agg = aggregator(fbs, group_cols)
    fbs_agg.rename(columns={
        'FlowAmount': 'FBS_amount',
        'Unit': 'FBS_unit'
    }, inplace=True)

    try:
        # merge FBA and FBS totals
        df_merge = fba_agg.merge(fbs_agg, how='left')
        df_merge['FlowAmount_difference'] = \
            df_merge['FBA_amount'] - df_merge['FBS_amount']
        df_merge['Percent_difference'] = \
            (df_merge['FlowAmount_difference']/df_merge['FBA_amount']) * 100

        # reorder
        df_merge = df_merge[[
            'Class', 'Context', 'Location', 'LocationSystem', 'FBA_amount',
            'FBA_unit', 'FBS_amount', 'FBS_unit', 'FlowAmount_difference',
            'Percent_difference'
        ]]
        df_merge = replace_NoneType_with_empty_cells(df_merge)

        # list of contexts and locations
        context_list = df_merge[['Context', 'Location']].values.tolist()

        # loop through the contexts and print results of comparison
        vLog.info('Comparing FBA %s %s subset to FBS results. '
                  'Details in Validation Log', activity_set,
                  source_attr['geoscale_to_use'])
        for i, j in context_list:
            df_merge_subset = \
                df_merge[(df_merge['Context'] == i) &
                         (df_merge['Location'] == j)].reset_index(drop=True)
            diff_per = df_merge_subset['Percent_difference'][0]
            if np.isnan(diff_per):
                vLog.info(
                    'FlowBySector FlowAmount for %s %s %s '
                    'does not exist in the FBS', source_name, activity_set, i)
                continue
            # make reporting more manageable
            if abs(diff_per) > 0.01:
                diff_per = round(diff_per, 2)
            else:
                diff_per = round(diff_per, 6)

            # diff_units = df_merge_subset['FBS_unit'][0]
            if diff_per > 0:
                vLog.info(
                    'FlowBySector FlowAmount for %s %s %s at %s is %s%% '
                    'less than the FlowByActivity FlowAmount', source_name,
                    activity_set, i, j, str(abs(diff_per)))
            elif diff_per < 0:
                vLog.info(
                    'FlowBySector FlowAmount for %s %s %s at %s is %s%% '
                    'more than the FlowByActivity FlowAmount', source_name,
                    activity_set, i, j, str(abs(diff_per)))
            elif diff_per == 0:
                vLogDetailed.info(
                    'FlowBySector FlowAmount for '
                    '%s %s %s at %s is equal to the '
                    'FlowByActivity FlowAmount', source_name, activity_set,
                    i, j)

        # subset the df to include in the validation log
        # only print rows where the percent difference does not round to 0
        df_v = df_merge[df_merge['Percent_difference'].apply(
            lambda x: round(x, 3) != 0)].reset_index(drop=True)

        # log output
        log.info(
            'Save the comparison of FlowByActivity load to FlowBySector '
            'total FlowAmounts for %s in validation log file', activity_set)
        # if df not empty, print, if empty, print string
        if df_v.empty:
            vLogDetailed.info('Percent difference for %s all round to 0',
                              activity_set)
        else:
            vLogDetailed.info(
                'Comparison of FBA load to FBS total '
                'FlowAmounts for %s: '
                '\n {}'.format(df_v.to_string()), activity_set)
    except Exception as e:
        # narrowed from a bare 'except:' so system-exiting exceptions
        # propagate, and log the actual error instead of hiding it
        vLog.info('Error occurred when comparing total FlowAmounts '
                  'for FlowByActivity and FlowBySector: %s', e)
def allocation_helper(df_w_sector, attr, method, v, download_FBA_if_missing):
    """
    Function to help allocate activity names using secondary df
    :param df_w_sector: df, includes sector columns
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param v: dictionary, the datasource parameters
    :param download_FBA_if_missing: bool, indicate if missing FBAs
        should be downloaded from Data Commons or run locally
    :return: df, with modified fba allocation values
    """
    from flowsa.validation import compare_df_units

    # add parameters to dictionary if exist in method yaml
    fba_dict = {}
    if 'helper_flow' in attr:
        fba_dict['flowname_subset'] = attr['helper_flow']
    if 'clean_helper_fba' in attr:
        fba_dict['clean_fba'] = attr['clean_helper_fba']
    if 'clean_helper_fba_wsec' in attr:
        fba_dict['clean_fba_w_sec'] = attr['clean_helper_fba_wsec']

    # load the allocation FBA
    helper_allocation = \
        load_map_clean_fba(method, attr,
                           fba_sourcename=attr['helper_source'],
                           df_year=attr['helper_source_year'],
                           flowclass=attr['helper_source_class'],
                           geoscale_from=attr['helper_from_scale'],
                           geoscale_to=v['geoscale_to_use'],
                           download_FBA_if_missing=download_FBA_if_missing,
                           **fba_dict)

    # run sector disagg to capture any missing lower level naics
    helper_allocation = sector_disaggregation(helper_allocation)

    # generalize activity field names to enable link to water withdrawal table
    helper_allocation = collapse_activity_fields(helper_allocation)
    # drop any rows not mapped
    helper_allocation = \
        helper_allocation[helper_allocation['Sector'].notnull()]
    # drop columns
    helper_allocation = \
        helper_allocation.drop(columns=['Activity', 'Min', 'Max'])
    # rename column
    helper_allocation = \
        helper_allocation.rename(columns={"FlowAmount": 'HelperFlow'})

    # determine the df_w_sector column to merge on
    df_w_sector = replace_strings_with_NoneType(df_w_sector)
    sec_consumed_list = \
        df_w_sector['SectorConsumedBy'].drop_duplicates().values.tolist()
    sec_produced_list = \
        df_w_sector['SectorProducedBy'].drop_duplicates().values.tolist()
    # if a sector field column is not all 'none', that is the column to merge
    if all(v is None for v in sec_consumed_list):
        sector_col_to_merge = 'SectorProducedBy'
    elif all(v is None for v in sec_produced_list):
        sector_col_to_merge = 'SectorConsumedBy'
    else:
        # NOTE(review): log.error does not raise, so when both sector
        # columns hold values, 'sector_col_to_merge' is left unbound and
        # the merges below raise NameError - confirm intended behavior
        log.error('There is not a clear sector column to base '
                  'merge with helper allocation dataset')

    # merge allocation df with helper df based on sectors,
    # depending on geo scales of dfs
    if (attr['helper_from_scale'] == 'state') and \
            (attr['allocation_from_scale'] == 'county'):
        # state-level helper onto county-level data: merge on the
        # 2-digit state FIPS prefix
        helper_allocation.loc[:, 'Location_tmp'] = \
            helper_allocation['Location'].apply(lambda x: x[0:2])
        df_w_sector.loc[:, 'Location_tmp'] = \
            df_w_sector['Location'].apply(lambda x: x[0:2])
        # merge_columns.append('Location_tmp')
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation =\
            df_w_sector.merge(
                helper_allocation[['Location_tmp', 'Sector', 'HelperFlow']],
                how='left',
                left_on=['Location_tmp', sector_col_to_merge],
                right_on=['Location_tmp', 'Sector'])
        modified_fba_allocation = \
            modified_fba_allocation.drop(columns=['Location_tmp'])
    elif (attr['helper_from_scale'] == 'national') and \
            (attr['allocation_from_scale'] != 'national'):
        # national helper onto subnational data: merge on sector only
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation = \
            df_w_sector.merge(helper_allocation[['Sector', 'HelperFlow']],
                              how='left',
                              left_on=[sector_col_to_merge],
                              right_on=['Sector'])
    else:
        # same geoscale: merge on location and sector
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation =\
            df_w_sector.merge(
                helper_allocation[['Location', 'Sector', 'HelperFlow']],
                left_on=['Location', sector_col_to_merge],
                right_on=['Location', 'Sector'],
                how='left')

    # load bea codes that sub for naics
    bea = return_bea_codes_used_as_naics()
    # replace sector column and helperflow value if the sector column to
    # merge is in the bea list to prevent dropped data
    modified_fba_allocation['Sector'] = \
        np.where(modified_fba_allocation[sector_col_to_merge].isin(bea),
                 modified_fba_allocation[sector_col_to_merge],
                 modified_fba_allocation['Sector'])
    modified_fba_allocation['HelperFlow'] = \
        np.where(modified_fba_allocation[sector_col_to_merge].isin(bea),
                 modified_fba_allocation['FlowAmount'],
                 modified_fba_allocation['HelperFlow'])

    # modify flow amounts using helper data
    if 'multiplication' in attr['helper_method']:
        # if missing values (na or 0), replace with national level values
        replacement_values =\
            helper_allocation[helper_allocation['Location'] ==
                              US_FIPS].reset_index(drop=True)
        replacement_values = \
            replacement_values.rename(
                columns={"HelperFlow": 'ReplacementValue'})
        compare_df_units(modified_fba_allocation, replacement_values)
        modified_fba_allocation = modified_fba_allocation.merge(
            replacement_values[['Sector', 'ReplacementValue']], how='left')
        modified_fba_allocation.loc[:, 'HelperFlow'] = \
            modified_fba_allocation['HelperFlow'].fillna(
            modified_fba_allocation['ReplacementValue'])
        modified_fba_allocation.loc[:, 'HelperFlow'] =\
            np.where(modified_fba_allocation['HelperFlow'] == 0,
                     modified_fba_allocation['ReplacementValue'],
                     modified_fba_allocation['HelperFlow'])

        # replace non-existent helper flow values with a 0,
        # so after multiplying, don't have incorrect value associated with
        # new unit
        modified_fba_allocation['HelperFlow'] =\
            modified_fba_allocation['HelperFlow'].fillna(value=0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = \
            modified_fba_allocation['FlowAmount'] * \
            modified_fba_allocation['HelperFlow']
        # drop columns
        modified_fba_allocation =\
            modified_fba_allocation.drop(
                columns=["HelperFlow", 'ReplacementValue', 'Sector'])

    elif attr['helper_method'] == 'proportional':
        # scale flow amounts by the helper's share within each
        # location/activity group
        modified_fba_allocation =\
            proportional_allocation_by_location_and_activity(
                modified_fba_allocation, sector_col_to_merge)
        modified_fba_allocation['FlowAmountRatio'] =\
            modified_fba_allocation['FlowAmountRatio'].fillna(0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = \
            modified_fba_allocation['FlowAmount'] * \
            modified_fba_allocation['FlowAmountRatio']
        modified_fba_allocation =\
            modified_fba_allocation.drop(
                columns=['FlowAmountRatio', 'HelperFlow', 'Sector'])

    elif attr['helper_method'] == 'proportional-flagged':
        # calculate denominators based on activity and 'flagged' column
        modified_fba_allocation =\
            modified_fba_allocation.assign(
                Denominator=modified_fba_allocation.groupby(
                    ['FlowName', 'ActivityConsumedBy', 'Location',
                     'disaggregate_flag'])['HelperFlow'].transform('sum'))
        modified_fba_allocation = modified_fba_allocation.assign(
            FlowAmountRatio=modified_fba_allocation['HelperFlow'] /
            modified_fba_allocation['Denominator'])
        modified_fba_allocation =\
            modified_fba_allocation.assign(
                FlowAmount=modified_fba_allocation['FlowAmount'] *
                modified_fba_allocation['FlowAmountRatio'])
        modified_fba_allocation =\
            modified_fba_allocation.drop(
                columns=['disaggregate_flag', 'Sector', 'HelperFlow',
                         'Denominator', 'FlowAmountRatio'])
        # run sector aggregation
        modified_fba_allocation = \
            sector_aggregation(modified_fba_allocation,
                               fba_wsec_default_grouping_fields)

    # drop rows of 0
    modified_fba_allocation =\
        modified_fba_allocation[
            modified_fba_allocation['FlowAmount'] != 0].reset_index(drop=True)

    # normalize employment-rate units after multiplication
    modified_fba_allocation.loc[modified_fba_allocation['Unit'] ==
                                'gal/employee', 'Unit'] = 'gal'

    # option to scale up fba values
    if 'scaled' in attr['helper_method']:
        log.info("Scaling %s to FBA values", attr['helper_source'])
        modified_fba_allocation = \
            dynamically_import_fxn(
                attr['allocation_source'], attr["scale_helper_results"])(
                modified_fba_allocation, attr,
                download_FBA_if_missing=download_FBA_if_missing)
    return modified_fba_allocation