def load_map_clean_fba(method, attr, fba_sourcename, df_year, flowclass,
                       geoscale_from, geoscale_to,
                       download_FBA_if_missing=False, **kwargs):
    """
    Load, clean, and map a FlowByActivity df
    :param method: dictionary, FBS method yaml
    :param attr: dictionary, attribute data from method yaml for activity set
    :param fba_sourcename: str, source name
    :param df_year: str, year
    :param flowclass: str, flowclass to subset df with
    :param geoscale_from: str, geoscale to use
    :param geoscale_to: str, geoscale to aggregate to
    :param download_FBA_if_missing: bool, if True download the FBA from
        Data Commons when not available locally
    :param kwargs: dictionary, can include parameters: 'flowname_subset',
        'compartment_subset', 'clean_fba', 'clean_fba_w_sec'
    :return: df, fba format
    """
    log.info("Loading allocation flowbyactivity %s for year %s",
             fba_sourcename, str(df_year))
    fba = load_fba_w_standardized_units(
        datasource=fba_sourcename, year=df_year, flowclass=flowclass,
        download_FBA_if_missing=download_FBA_if_missing)

    # check if allocation data exists at specified geoscale to use
    log.info("Checking if allocation data exists at the %s level",
             geoscale_from)
    check_if_data_exists_at_geoscale(fba, geoscale_from)

    # aggregate geographically to the scale of the flowbyactivity source,
    # if necessary
    fba = subset_df_by_geoscale(fba, geoscale_from, geoscale_to)

    # subset based on yaml settings
    if 'flowname_subset' in kwargs:
        if kwargs['flowname_subset'] != 'None':
            fba = fba.loc[fba['FlowName'].isin(kwargs['flowname_subset'])]
    if 'compartment_subset' in kwargs:
        if kwargs['compartment_subset'] != 'None':
            fba = fba.loc[
                fba['Compartment'].isin(kwargs['compartment_subset'])]

    # clean up the fba allocation df, if necessary
    if 'clean_fba' in kwargs:
        log.info("Cleaning %s", fba_sourcename)
        fba = dynamically_import_fxn(
            fba_sourcename, kwargs["clean_fba"])(fba, attr=attr)
    # reset index
    fba = fba.reset_index(drop=True)

    # assign sectors to allocation dataset
    log.info("Adding sectors to %s", fba_sourcename)
    fba_wsec = add_sectors_to_flowbyactivity(
        fba, sectorsourcename=method['target_sector_source'])

    # call on fxn to further clean up/disaggregate the fba allocation
    # data, if exists
    if 'clean_fba_w_sec' in kwargs:
        log.info("Further disaggregating sectors in %s", fba_sourcename)
        fba_wsec = dynamically_import_fxn(
            fba_sourcename, kwargs['clean_fba_w_sec'])(
            fba_wsec, attr=attr, method=method, sourcename=fba_sourcename)

    return fba_wsec
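# Illustrative call (all values are hypothetical stand-ins, not a real
# method configuration), showing how the optional kwargs documented above
# arrive, typically unpacked from a dict built off the method yaml:
#
# fba_wsec = load_map_clean_fba(
#     method, attr, fba_sourcename='BLS_QCEW', df_year='2015',
#     flowclass='Employment', geoscale_from='state', geoscale_to='state',
#     flowname_subset=['Number of employees'],
#     clean_fba='clean_bls_qcew_fba')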
def load_source_dataframe(k, v):
    """
    Load the source dataframe. Data can be a FlowByActivity or
    FlowBySector parquet stored in flowsa, or a FlowBySector formatted
    dataframe from another package.
    :param k: str, the datasource name
    :param v: dictionary, the datasource parameters
    :return: df of identified parquet
    """
    if v['data_format'] == 'FBA':
        # if yaml specifies a geoscale to load, use parameter
        # to filter dataframe
        if 'source_fba_load_scale' in v:
            geo_level = v['source_fba_load_scale']
        else:
            geo_level = None
        vLog.info("Retrieving flowbyactivity for datasource %s in year %s",
                  k, str(v['year']))
        flows_df = flowsa.getFlowByActivity(datasource=k, year=v['year'],
                                            flowclass=v['class'],
                                            geographic_level=geo_level)
    elif v['data_format'] == 'FBS':
        vLog.info("Retrieving flowbysector for datasource %s", k)
        flows_df = flowsa.getFlowBySector(k)
    elif v['data_format'] == 'FBS_outside_flowsa':
        vLog.info("Retrieving flowbysector for datasource %s", k)
        flows_df = dynamically_import_fxn(k, v["FBS_datapull_fxn"])(v)
    else:
        vLog.error("Data format not specified in method file "
                   "for datasource %s", k)
        flows_df = None  # nothing to return for an unrecognized format

    return flows_df
def function_allocation_method(flow_subset_mapped, k, names, attr, fbs_list):
    """
    Allocate df activities to sectors using a function identified
    in the FBS method yaml
    :param flow_subset_mapped: df, FBA with flows converted using
        fedelemflowlist
    :param k: str, source name
    :param names: list, activity names in activity set
    :param attr: dictionary, attribute data from method yaml for activity set
    :param fbs_list: list, fbs dfs created running flowbysector.py
    :return: df, FBS, with activity columns allocated to sectors
    """
    log.info('Calling on function specified in method yaml to allocate '
             '%s to sectors', ', '.join(map(str, names)))
    fbs = dynamically_import_fxn(k, attr['allocation_source'])(
        flow_subset_mapped, attr, fbs_list)
    return fbs
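# Every function in this module resolves yaml-named functions through
# `dynamically_import_fxn`, which is defined elsewhere in flowsa. A minimal
# sketch of the idea, for context only; the `importlib` approach and the
# module path are assumptions, not flowsa's exact implementation:
import importlib


def _dynamically_import_fxn_sketch(source_name, fxn_name):
    """Return the function named in a yaml from the source's module."""
    module = importlib.import_module(
        f"flowsa.data_source_scripts.{source_name}")
    return getattr(module, fxn_name)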
def parse_data(dataframe_list, args, config):
    """
    Calls on functions defined in source.py files, as parsing rules
    are specific to the data source.
    :param dataframe_list: list, dfs to concat and parse
    :param args: dictionary, load parameters 'source' and 'year'
    :param config: dictionary, FBA yaml
    :return: df, single df formatted to FBA
    """
    df = None  # guard against a missing parse fxn in the yaml
    if "parse_response_fxn" in config:
        # dynamically import and call on function
        df = dynamically_import_fxn(
            args['source'], config["parse_response_fxn"])(
            dataframe_list=dataframe_list, args=args)
    return df
def parse_data(*, df_list, source, year, config):
    """
    Calls on functions defined in source.py files, as parsing rules
    are specific to the data source.
    :param df_list: list, dfs to concat and parse
    :param source: str, data source
    :param year: str, year
    :param config: dictionary, FBA yaml
    :return: df, single df formatted to FBA
    """
    df = None  # guard against a missing parse fxn in the yaml
    if "parse_response_fxn" in config:
        # dynamically import and call on function
        df = dynamically_import_fxn(
            source, config["parse_response_fxn"])(df_list=df_list,
                                                  source=source,
                                                  year=year,
                                                  config=config)
    return df
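# A sketch of the contract a yaml-named `parse_response_fxn` is expected to
# meet: the keyword signature mirrors the call above, while the body and
# column names are illustrative assumptions, not a real source's parser:
import pandas as pd


def _example_parse_response_fxn(*, df_list, source, year, config):
    """Concatenate per-url dfs and stamp the standard FBA source fields."""
    df = pd.concat(df_list, ignore_index=True)
    df['SourceName'] = source
    df['Year'] = year
    return df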
def assemble_urls_for_query(build_url, config, args):
    """
    Calls on helper functions defined in source.py files to replace
    parts of the url string
    :param build_url: str, base url
    :param config: dictionary, FBA yaml
    :param args: dictionary, load parameters 'source' and 'year'
    :return: list, urls to call data from
    """
    if "url_replace_fxn" in config:
        # dynamically import and call on function
        urls = dynamically_import_fxn(
            args['source'], config["url_replace_fxn"])(build_url=build_url,
                                                       config=config,
                                                       args=args)
    else:
        urls = [build_url]
    return urls
def load_source_dataframe(sourcename, source_dict, download_FBA_if_missing):
    """
    Load the source dataframe. Data can be a FlowByActivity or
    FlowBySector parquet stored in flowsa, or a FlowBySector formatted
    dataframe from another package.
    :param sourcename: str, the datasource name
    :param source_dict: dictionary, the datasource parameters
    :param download_FBA_if_missing: bool, if True will download FBAs from
        Data Commons. Default is False.
    :return: df of identified parquet
    """
    if source_dict['data_format'] == 'FBA':
        # if yaml specifies a geoscale to load, use parameter
        # to filter dataframe
        if 'source_fba_load_scale' in source_dict:
            geo_level = source_dict['source_fba_load_scale']
        else:
            geo_level = None
        vLog.info("Retrieving Flow-By-Activity for datasource %s in year %s",
                  sourcename, str(source_dict['year']))
        flows_df = flowsa.getFlowByActivity(
            datasource=sourcename,
            year=source_dict['year'],
            flowclass=source_dict['class'],
            geographic_level=geo_level,
            download_FBA_if_missing=download_FBA_if_missing)
    elif source_dict['data_format'] == 'FBS':
        vLog.info("Retrieving flowbysector for datasource %s", sourcename)
        flows_df = flowsa.getFlowBySector(sourcename)
    elif source_dict['data_format'] == 'FBS_outside_flowsa':
        vLog.info("Retrieving flowbysector for datasource %s", sourcename)
        flows_df = dynamically_import_fxn(
            sourcename, source_dict["FBS_datapull_fxn"])(source_dict)
    else:
        vLog.error("Data format not specified in method file "
                   "for datasource %s", sourcename)
        flows_df = None  # nothing to return for an unrecognized format

    return flows_df
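# For context, the `data_format` key dispatched on above comes from the FBS
# method yaml. An illustrative source entry, rendered as the arguments this
# function would receive (the source name and values are hypothetical):
#
# sourcename = 'USDA_CoA_Cropland'
# source_dict = {'data_format': 'FBA', 'class': 'Land', 'year': 2017,
#                'source_fba_load_scale': 'state'}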
def call_urls(url_list, args, config):
    """
    This method calls all the urls that have been generated. It then
    calls the processing method to begin processing the returned data.
    The processing method is specific to the data source, so this
    function relies on a function in source.py.
    :param url_list: list, urls to call
    :param args: dictionary, load parameters 'source' and 'year'
    :param config: dictionary, FBA yaml
    :return: list, dfs to concat and parse
    """
    # start requests session
    s = requests.Session()
    # identify if url request requires cookies set
    if 'allow_http_request_cookies' in config:
        set_cookies = 'yes'
    else:
        set_cookies = 'no'

    # create dataframes list by iterating through url list
    data_frames_list = []
    if url_list[0] is not None:
        for url in url_list:
            log.info("Calling %s", url)
            r = make_http_request(url, requests_session=s,
                                  set_cookies=set_cookies)
            if "call_response_fxn" in config:
                # dynamically import and call on function
                df = dynamically_import_fxn(
                    args['source'], config["call_response_fxn"])(
                    url=url, r=r, args=args)
                if isinstance(df, pd.DataFrame):
                    data_frames_list.append(df)
                elif isinstance(df, list):
                    data_frames_list.extend(df)

    return data_frames_list
def call_urls(*, url_list, source, year, config):
    """
    This method calls all the urls that have been generated. It then
    calls the processing method to begin processing the returned data.
    The processing method is specific to the data source, so this
    function relies on a function in source.py.
    :param url_list: list, urls to call
    :param source: str, data source
    :param year: str, year
    :param config: dictionary, FBA yaml
    :return: list, dfs to concat and parse
    """
    # identify if url request requires cookies set
    set_cookies = config.get('allow_http_request_cookies')
    confirm_gdrive = config.get('confirm_gdrive')

    # create dataframes list by iterating through url list
    data_frames_list = []
    if url_list[0] is not None:
        for url in url_list:
            log.info("Calling %s", url)
            resp = make_url_request(url, set_cookies=set_cookies,
                                    confirm_gdrive=confirm_gdrive)
            if "call_response_fxn" in config:
                # dynamically import and call on function
                df = dynamically_import_fxn(
                    source, config["call_response_fxn"])(resp=resp,
                                                         source=source,
                                                         year=year,
                                                         config=config,
                                                         url=url)
                if isinstance(df, pd.DataFrame):
                    data_frames_list.append(df)
                elif isinstance(df, list):
                    data_frames_list.extend(df)

    return data_frames_list
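# Likewise, a sketch of the `call_response_fxn` contract used above; the
# keyword signature mirrors the call, while the CSV payload is an assumption
# for illustration (sources may also return a list of dfs):
import io

import pandas as pd


def _example_call_response_fxn(*, resp, source, year, config, url):
    """Convert one HTTP response into a DataFrame for later parsing."""
    return pd.read_csv(io.StringIO(resp.text))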
def assemble_urls_for_query(*, source, year, config):
    """
    Calls on helper functions defined in source.py files to replace
    parts of the url string
    :param source: str, data source
    :param year: str, year
    :param config: dictionary, FBA yaml
    :return: list, urls to call data from
    """
    # if there are url parameters defined in the yaml,
    # then build a url, else use "base_url"
    urlinfo = config['url']
    if urlinfo == 'None':
        return [None]

    if 'url_params' in urlinfo:
        params = parse.urlencode(urlinfo['url_params'], safe='=&%',
                                 quote_via=parse.quote)
        build_url = urlinfo['base_url'] + urlinfo['api_path'] + params
    else:
        build_url = urlinfo['base_url']

    # substitute year from arguments and user's api key into the url
    build_url = build_url.replace("__year__", str(year))
    if "__apiKey__" in build_url:
        userAPIKey = load_api_key(config['api_name'])  # (common.py fxn)
        build_url = build_url.replace("__apiKey__", userAPIKey)

    if "url_replace_fxn" in config:
        # dynamically import and call on function
        urls = dynamically_import_fxn(
            source, config["url_replace_fxn"])(build_url=build_url,
                                               source=source, year=year,
                                               config=config)
        return urls
    else:
        return [build_url]
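# Worked illustration of the placeholder substitution above (url and key are
# hypothetical): a yaml base_url of
#   "https://api.example.gov/v1/data?year=__year__&key=__apiKey__"
# called for year 2017 becomes
#   "https://api.example.gov/v1/data?year=2017&key=<user's stored key>"
# before any source-specific `url_replace_fxn` expands it into one url per
# state, commodity, or other query dimension.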
def allocation_helper(df_w_sector, attr, method, v, download_FBA_if_missing):
    """
    Function to help allocate activity names using secondary df
    :param df_w_sector: df, includes sector columns
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param v: dictionary, the datasource parameters
    :param download_FBA_if_missing: bool, indicate if missing FBAs should be
        downloaded from Data Commons or run locally
    :return: df, with modified fba allocation values
    """
    from flowsa.validation import compare_df_units

    # add parameters to dictionary if exist in method yaml
    fba_dict = {}
    if 'helper_flow' in attr:
        fba_dict['flowname_subset'] = attr['helper_flow']
    if 'clean_helper_fba' in attr:
        fba_dict['clean_fba'] = attr['clean_helper_fba']
    if 'clean_helper_fba_wsec' in attr:
        fba_dict['clean_fba_w_sec'] = attr['clean_helper_fba_wsec']

    # load the allocation FBA
    helper_allocation = load_map_clean_fba(
        method, attr, fba_sourcename=attr['helper_source'],
        df_year=attr['helper_source_year'],
        flowclass=attr['helper_source_class'],
        geoscale_from=attr['helper_from_scale'],
        geoscale_to=v['geoscale_to_use'],
        download_FBA_if_missing=download_FBA_if_missing,
        **fba_dict)

    # run sector disaggregation to capture any missing lower-level naics
    helper_allocation = sector_disaggregation(helper_allocation)
    # generalize activity field names to enable link to water withdrawal table
    helper_allocation = collapse_activity_fields(helper_allocation)
    # drop any rows not mapped
    helper_allocation = helper_allocation[
        helper_allocation['Sector'].notnull()]
    # drop columns
    helper_allocation = helper_allocation.drop(
        columns=['Activity', 'Min', 'Max'])
    # rename column
    helper_allocation = helper_allocation.rename(
        columns={"FlowAmount": 'HelperFlow'})

    # determine the df_w_sector column to merge on
    df_w_sector = replace_strings_with_NoneType(df_w_sector)
    sec_consumed_list = \
        df_w_sector['SectorConsumedBy'].drop_duplicates().values.tolist()
    sec_produced_list = \
        df_w_sector['SectorProducedBy'].drop_duplicates().values.tolist()
    # if a sector field column is not all 'none', that is the column to merge
    if all(s is None for s in sec_consumed_list):
        sector_col_to_merge = 'SectorProducedBy'
    elif all(s is None for s in sec_produced_list):
        sector_col_to_merge = 'SectorConsumedBy'
    else:
        log.error('There is not a clear sector column to base '
                  'merge with helper allocation dataset')

    # merge allocation df with helper df based on sectors,
    # depending on geo scales of dfs
    if (attr['helper_from_scale'] == 'state') and \
            (attr['allocation_from_scale'] == 'county'):
        helper_allocation.loc[:, 'Location_tmp'] = \
            helper_allocation['Location'].apply(lambda x: x[0:2])
        df_w_sector.loc[:, 'Location_tmp'] = \
            df_w_sector['Location'].apply(lambda x: x[0:2])
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation = df_w_sector.merge(
            helper_allocation[['Location_tmp', 'Sector', 'HelperFlow']],
            how='left',
            left_on=['Location_tmp', sector_col_to_merge],
            right_on=['Location_tmp', 'Sector'])
        modified_fba_allocation = \
            modified_fba_allocation.drop(columns=['Location_tmp'])
    elif (attr['helper_from_scale'] == 'national') and \
            (attr['allocation_from_scale'] != 'national'):
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation = df_w_sector.merge(
            helper_allocation[['Sector', 'HelperFlow']],
            how='left',
            left_on=[sector_col_to_merge],
            right_on=['Sector'])
    else:
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation = df_w_sector.merge(
            helper_allocation[['Location', 'Sector', 'HelperFlow']],
            left_on=['Location', sector_col_to_merge],
            right_on=['Location', 'Sector'],
            how='left')

    # load bea codes that sub for naics
    bea = return_bea_codes_used_as_naics()
    # replace sector column and helperflow value if the sector column to
    # merge is in the bea list to prevent dropped data
    modified_fba_allocation['Sector'] = np.where(
        modified_fba_allocation[sector_col_to_merge].isin(bea),
        modified_fba_allocation[sector_col_to_merge],
        modified_fba_allocation['Sector'])
    modified_fba_allocation['HelperFlow'] = np.where(
        modified_fba_allocation[sector_col_to_merge].isin(bea),
        modified_fba_allocation['FlowAmount'],
        modified_fba_allocation['HelperFlow'])

    # modify flow amounts using helper data
    if 'multiplication' in attr['helper_method']:
        # if missing values (na or 0), replace with national level values
        replacement_values = helper_allocation[
            helper_allocation['Location'] == US_FIPS].reset_index(drop=True)
        replacement_values = replacement_values.rename(
            columns={"HelperFlow": 'ReplacementValue'})
        compare_df_units(modified_fba_allocation, replacement_values)
        modified_fba_allocation = modified_fba_allocation.merge(
            replacement_values[['Sector', 'ReplacementValue']], how='left')
        modified_fba_allocation.loc[:, 'HelperFlow'] = \
            modified_fba_allocation['HelperFlow'].fillna(
                modified_fba_allocation['ReplacementValue'])
        modified_fba_allocation.loc[:, 'HelperFlow'] = np.where(
            modified_fba_allocation['HelperFlow'] == 0,
            modified_fba_allocation['ReplacementValue'],
            modified_fba_allocation['HelperFlow'])

        # replace non-existent helper flow values with a 0, so after
        # multiplying, don't have incorrect value associated with new unit
        modified_fba_allocation['HelperFlow'] = \
            modified_fba_allocation['HelperFlow'].fillna(value=0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = \
            modified_fba_allocation['FlowAmount'] * \
            modified_fba_allocation['HelperFlow']
        # drop columns
        modified_fba_allocation = modified_fba_allocation.drop(
            columns=["HelperFlow", 'ReplacementValue', 'Sector'])
    elif attr['helper_method'] == 'proportional':
        modified_fba_allocation = \
            proportional_allocation_by_location_and_activity(
                modified_fba_allocation, sector_col_to_merge)
        modified_fba_allocation['FlowAmountRatio'] = \
            modified_fba_allocation['FlowAmountRatio'].fillna(0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = \
            modified_fba_allocation['FlowAmount'] * \
            modified_fba_allocation['FlowAmountRatio']
        modified_fba_allocation = modified_fba_allocation.drop(
            columns=['FlowAmountRatio', 'HelperFlow', 'Sector'])
    elif attr['helper_method'] == 'proportional-flagged':
        # calculate denominators based on activity and 'flagged' column
        modified_fba_allocation = modified_fba_allocation.assign(
            Denominator=modified_fba_allocation.groupby(
                ['FlowName', 'ActivityConsumedBy', 'Location',
                 'disaggregate_flag'])['HelperFlow'].transform('sum'))
        modified_fba_allocation = modified_fba_allocation.assign(
            FlowAmountRatio=modified_fba_allocation['HelperFlow'] /
            modified_fba_allocation['Denominator'])
        modified_fba_allocation = modified_fba_allocation.assign(
            FlowAmount=modified_fba_allocation['FlowAmount'] *
            modified_fba_allocation['FlowAmountRatio'])
        modified_fba_allocation = modified_fba_allocation.drop(
            columns=['disaggregate_flag', 'Sector', 'HelperFlow',
                     'Denominator', 'FlowAmountRatio'])

    # run sector aggregation
    modified_fba_allocation = sector_aggregation(
        modified_fba_allocation, fba_wsec_default_grouping_fields)
    # drop rows of 0
    modified_fba_allocation = modified_fba_allocation[
        modified_fba_allocation['FlowAmount'] != 0].reset_index(drop=True)

    modified_fba_allocation.loc[
        modified_fba_allocation['Unit'] == 'gal/employee', 'Unit'] = 'gal'

    # option to scale up fba values
    if 'scaled' in attr['helper_method']:
        log.info("Scaling %s to FBA values", attr['helper_source'])
        modified_fba_allocation = dynamically_import_fxn(
            attr['allocation_source'], attr["scale_helper_results"])(
            modified_fba_allocation, attr,
            download_FBA_if_missing=download_FBA_if_missing)

    return modified_fba_allocation
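# Worked illustration of the 'multiplication' helper method (numbers are
# hypothetical): if an activity carries FlowAmount = 12 gal/employee and the
# helper df supplies HelperFlow = 500 employees for the matched
# sector/location, the adjusted FlowAmount is 12 * 500 = 6000, which is why
# 'gal/employee' is relabeled 'gal' above.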
def main(**kwargs):
    """
    Creates a flowbysector dataset
    :param kwargs: dictionary of arguments: "method", the name of the
        flowbysector method yaml, and optionally
        "download_FBAs_if_missing"
    :return: parquet, FBS saved to local folder
    """
    if len(kwargs) == 0:
        kwargs = parse_args()

    method_name = kwargs['method']
    download_FBA_if_missing = kwargs.get('download_FBAs_if_missing')
    # assign arguments
    vLog.info("Initiating flowbysector creation for %s", method_name)
    # call on method
    method = load_yaml_dict(method_name, flowbytype='FBS')
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # Create empty list for storing fbs files
    fbs_list = []
    for k, v in fb.items():
        # pull fba data for allocation
        flows = load_source_dataframe(k, v, download_FBA_if_missing)

        if v['data_format'] == 'FBA':
            # ensure correct datatypes and that all fields exist
            flows = clean_df(flows, flow_by_activity_fields,
                             fba_fill_na_dict, drop_description=False)

            # clean up fba before mapping, if specified in yaml
            if "clean_fba_before_mapping_df_fxn" in v:
                vLog.info("Cleaning up %s FlowByActivity", k)
                flows = dynamically_import_fxn(
                    k, v["clean_fba_before_mapping_df_fxn"])(flows)

            # map flows to federal flow list or material flow list
            flows_mapped, mapping_files = \
                map_fbs_flows(flows, k, v, keep_fba_columns=True)

            # clean up fba, if specified in yaml
            if "clean_fba_df_fxn" in v:
                vLog.info("Cleaning up %s FlowByActivity", k)
                flows_mapped = dynamically_import_fxn(
                    k, v["clean_fba_df_fxn"])(flows_mapped)

            # if activity_sets are specified in a file, call them here
            if 'activity_set_file' in v:
                aset_names = pd.read_csv(flowbysectoractivitysetspath +
                                         v['activity_set_file'], dtype=str)
            else:
                aset_names = None

            # master list of activity names read in from data source
            ml_act = []
            # create dictionary of allocation datasets for
            # different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities
                if 'activity_set_file' in v:
                    names = \
                        aset_names[aset_names['activity_set'] == aset]['name']
                else:
                    names = attr['names']

                # to avoid double counting data from the same source, in
                # the event there are values in both the APB and ACB
                # columns, if an activity has already been read in and
                # allocated, remove that activity from the mapped flows
                # regardless of what activity set the data was read in
                flows_mapped = flows_mapped[~(
                    (flows_mapped[fba_activity_fields[0]].isin(ml_act)) |
                    (flows_mapped[fba_activity_fields[1]].isin(ml_act))
                )].reset_index(drop=True)
                ml_act.extend(names)

                vLog.info("Preparing to handle %s in %s", aset, k)
                # subset fba data by activity
                flows_subset = flows_mapped[
                    (flows_mapped[fba_activity_fields[0]].isin(names)) |
                    (flows_mapped[fba_activity_fields[1]].isin(names)
                     )].reset_index(drop=True)

                # subset by flowname if exists
                if 'source_flows' in attr:
                    flows_subset = flows_subset[
                        flows_subset['FlowName'].isin(attr['source_flows'])]
                if len(flows_subset) == 0:
                    log.warning(f"no data found for flows in {aset}")
                    continue
                if len(flows_subset[flows_subset['FlowAmount'] != 0]) == 0:
                    log.warning(f"all flow data for {aset} is 0")
                    continue
                # if activities are sector-like, check sectors are valid
                if check_activities_sector_like(k):
                    flows_subset2 = replace_naics_w_naics_from_another_year(
                        flows_subset, method['target_sector_source'])
                    # check impact on df FlowAmounts
                    vLog.info('Calculate FlowAmount difference caused by '
                              'replacing NAICS Codes with %s, saving '
                              'difference in Validation log',
                              method['target_sector_source'])
                    calculate_flowamount_diff_between_dfs(
                        flows_subset, flows_subset2)
                else:
                    flows_subset2 = flows_subset.copy()

                # extract relevant geoscale data or aggregate existing data
                flows_subset_geo = subset_df_by_geoscale(
                    flows_subset2, v['geoscale_to_use'],
                    attr['allocation_from_scale'])
                # if loading data at subnational geoscale, check for data loss
                if attr['allocation_from_scale'] != 'national':
                    compare_geographic_totals(flows_subset_geo, flows_mapped,
                                              k, attr, aset, names)

                # Add sectors to df activity, depending on level
                # of specified sector aggregation
                log.info("Adding sectors to %s", k)
                flows_subset_wsec = add_sectors_to_flowbyactivity(
                    flows_subset_geo,
                    sectorsourcename=method['target_sector_source'],
                    allocationmethod=attr['allocation_method'])
                # clean up fba with sectors, if specified in yaml
                if "clean_fba_w_sec_df_fxn" in v:
                    vLog.info("Cleaning up %s FlowByActivity with sectors", k)
                    flows_subset_wsec = dynamically_import_fxn(
                        k, v["clean_fba_w_sec_df_fxn"])(
                        flows_subset_wsec, attr=attr, method=method)

                # rename SourceName to MetaSources and drop columns
                flows_mapped_wsec = flows_subset_wsec.\
                    rename(columns={'SourceName': 'MetaSources'}).\
                    drop(columns=['FlowName', 'Compartment'])

                # if allocation method is "direct", then no need to create
                # alloc ratios, else need to use allocation dataframe to
                # create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    fbs = direct_allocation_method(flows_mapped_wsec,
                                                   k, names, method)
                # if allocation method for an activity set requires a
                # specific function due to the complicated nature of the
                # allocation, call on function here
                elif attr['allocation_method'] == 'allocation_function':
                    fbs = function_allocation_method(flows_mapped_wsec,
                                                     k, names, attr, fbs_list)
                else:
                    fbs = dataset_allocation_method(
                        flows_mapped_wsec, attr, names, method, k, v,
                        aset, aset_names, download_FBA_if_missing)

                # drop rows where flowamount = 0
                # (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # define grouping columns dependent on sectors
                # being activity-like or not
                if check_activities_sector_like(k) is False:
                    groupingcols = fbs_grouping_fields_w_activities
                    groupingdict = flow_by_sector_fields_w_activity
                else:
                    groupingcols = fbs_default_grouping_fields
                    groupingdict = flow_by_sector_fields

                # clean df
                fbs = clean_df(fbs, groupingdict, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                log.info("Aggregating flowbysector to %s level",
                         method['target_geoscale'])
                # determine from scale
                if fips_number_key[v['geoscale_to_use']] < \
                        fips_number_key[attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']

                fbs_geo_agg = agg_by_geoscale(fbs, from_scale,
                                              method['target_geoscale'],
                                              groupingcols)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs_sec_agg = sector_aggregation(fbs_geo_agg, groupingcols)
                # add missing naics5/6 when only one naics5/6
                # associated with a naics4
                fbs_agg = sector_disaggregation(fbs_sec_agg)

                # check if any sector information is lost before reaching
                # the target sector length, if so, allocate values equally
                # to disaggregated sectors
                vLog.info('Searching for and allocating FlowAmounts for any '
                          'parent NAICS that were dropped in the subset to '
                          '%s child NAICS', method['target_sector_level'])
                fbs_agg_2 = equally_allocate_parent_to_child_naics(
                    fbs_agg, method['target_sector_level'])

                # compare flowbysector with flowbyactivity
                compare_activity_to_sector_flowamounts(
                    flows_mapped_wsec, fbs_agg_2, aset, k, method)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])

                # subset df, necessary because not all of the sectors are
                # NAICS and can get duplicate rows
                fbs_1 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))].\
                    reset_index(drop=True)
                fbs_2 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isnull())].\
                    reset_index(drop=True)
                fbs_3 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isnull()) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))].\
                    reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2, fbs_3])

                # drop activity columns
                fbs_sector_subset = fbs_sector_subset.drop(
                    ['ActivityProducedBy', 'ActivityConsumedBy'],
                    axis=1, errors='ignore')

                # save comparison of FBA total to FBS total for an
                # activity set
                compare_fba_geo_subset_and_fbs_output_totals(
                    flows_subset_geo, fbs_sector_subset, aset, k, v,
                    attr, method)

                log.info("Completed flowbysector for %s", aset)
                fbs_list.append(fbs_sector_subset)
        else:
            if 'clean_fbs_df_fxn' in v:
                flows = dynamically_import_fxn(
                    v["clean_fbs_df_fxn_source"],
                    v["clean_fbs_df_fxn"])(flows)
            flows = update_geoscale(flows, method['target_geoscale'])
            # if the loaded flow df is already in FBS format,
            # append directly to list of FBS
            log.info("Append %s to FBS list", k)
            # ensure correct field datatypes and add any missing fields
            flows = clean_df(flows, flow_by_sector_fields, fbs_fill_na_dict)
            fbs_list.append(flows)

    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbs_list, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # add missing fields, ensure correct data type,
    # add missing columns, reorder columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    # prior to aggregating, replace MetaSources string with all sources
    # that share context/flowable/sector values
    fbss = harmonize_FBS_columns(fbss)
    # aggregate df as activities might have data for
    # the same specified sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    # ensure correct data types/order of columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy',
         'Flowable', 'Context']).reset_index(drop=True)
    # check for negative flow amounts
    check_for_negative_flowamounts(fbss)
    # tmp reset data quality scores
    fbss = reset_fbs_dq_scores(fbss)
    # save parquet file
    meta = set_fb_meta(method_name, "FlowBySector")
    write_df_to_file(fbss, paths, meta)
    write_metadata(method_name, method, meta, "FlowBySector")
    # rename the log file saved to local directory
    rename_log_file(method_name, meta)
    log.info('See the Validation log for detailed assessment of '
             'model results in %s', logoutputpath)
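# Hedged usage sketch: run from the command line, in which case parse_args()
# supplies the kwargs, or call main() directly. The method name below is
# illustrative; any FBS method yaml shipped with flowsa would work:
#
# if __name__ == '__main__':
#     main(method='Water_national_2015_m1', download_FBAs_if_missing=True)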